From f4e736b7d97399f4c402c52c335a7a0bb836e21f Mon Sep 17 00:00:00 2001 From: neuron-code-sharing-robot Date: Thu, 9 Apr 2026 19:39:50 +0000 Subject: [PATCH] Extracted contents of aws-neuronx-dkms-2.27.4.0.noarch.rpm --- dkms.conf | 2 +- neuron_cdev.c | 333 ++++++++++- neuron_cdev.h | 13 + neuron_core.c | 54 +- neuron_device.h | 10 +- neuron_dhal.c | 14 +- neuron_dhal.h | 34 +- neuron_dma.c | 1011 +++++++++++++++++++++++++++------- neuron_dma.h | 53 +- neuron_dmabuf.c | 3 +- neuron_fw_io.c | 156 +++++- neuron_fw_io.h | 161 +++++- neuron_ioctl.h | 39 +- neuron_mempool.c | 98 +++- neuron_mempool.h | 24 +- neuron_metrics.c | 141 +++-- neuron_metrics.h | 8 +- neuron_module.c | 8 +- neuron_nq.h | 42 ++ neuron_pci.c | 208 ++++++- neuron_pci.h | 7 + neuron_power.c | 2 +- neuron_reg_access.c | 2 +- neuron_reset.c | 18 +- neuron_reset.h | 5 +- neuron_ring.c | 37 +- neuron_ring.h | 112 ++++ neuron_sysfs_metrics.c | 3 +- share/neuron_driver_shared.h | 39 +- udma/udma.h | 44 +- udma/udma_m2m.c | 5 +- udma/udma_main.c | 12 +- udma/udma_regs.h | 10 +- v2/address_map.h | 3 - v2/neuron_dhal_v2.c | 358 ++---------- v2/notific.c | 3 - v2/notific.h | 41 +- v3/address_map.h | 1 - v3/neuron_dhal_v3.c | 463 +++++----------- v3/neuron_pelect.c | 208 ++++--- v3/neuron_pelect.h | 7 + v3/notific.c | 2 - v3/notific.h | 46 +- v4/address_map.h | 1 - v4/neuron_dhal_v4.c | 75 +-- 45 files changed, 2575 insertions(+), 1341 deletions(-) diff --git a/dkms.conf b/dkms.conf index 24de583..ea60afc 100644 --- a/dkms.conf +++ b/dkms.conf @@ -1,5 +1,5 @@ PACKAGE_NAME=aws-neuronx -PACKAGE_VERSION=2.26.5.0 +PACKAGE_VERSION=2.27.4.0 BUILT_MODULE_NAME[0]="neuron" MAKE[0]="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build" CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean" diff --git a/neuron_cdev.c b/neuron_cdev.c index ad4e3d7..4dfbfe7 100644 --- a/neuron_cdev.c +++ b/neuron_cdev.c @@ -1233,7 +1233,7 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, unsigned int cmd, op.buffer = buffer; op.size = size; - ret = ndma_memcpy_zerocopy(nd, nc_id, &op, 1, dev_base, qid, copy_to_mem_handle ? true : false); + ret = ndma_zerocopy_submit(nd, nc_id, &op, 1, dev_base, qid, copy_to_mem_handle ? 
true : false, 0); } return ret; @@ -1409,7 +1409,7 @@ static int ncdev_mem_buf_zerocopy64_batch(struct neuron_device *nd, void *param) } // use the zero-copy batch function for ops within a single batch - ret = ndma_memcpy_zerocopy(nd, nc_id, batch->ops_ptr, batch->num_ops, dev_base, qid, arg.is_copy_to_device); + ret = ndma_zerocopy_submit(nd, nc_id, batch->ops_ptr, batch->num_ops, dev_base, qid, arg.is_copy_to_device, arg.sequence_num); if (ret) { pr_err("batch zero-copy DMA failed on batch %d on nd%02d: %d\n", i, nd->device_index, ret); goto cleanup; @@ -1502,7 +1502,7 @@ static long ncdev_bar_read(struct neuron_device *nd, u8 bar, u64 *reg_addresses, if (data == NULL) return -ENOMEM; - ret = ndhal->ndhal_reg_access.reg_read32_array((void **)reg_addresses, data, data_count); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array((void **)reg_addresses, data, data_count, true); if (ret) { kfree(data); return ret; @@ -2933,7 +2933,7 @@ static int ncdev_h2t_dma_alloc_queues(struct neuron_device *nd, unsigned int cmd return -E2BIG; } - if (arg.copy_queue_cnt + arg.service_queue_cnt >= DMA_MAX_Q_MAX) { + if (arg.copy_queue_cnt + arg.service_queue_cnt >= ndhal->ndhal_udma.num_queues) { pr_err("nd%02d: invalid total queue count %d provided", nd->device_index, arg.copy_queue_cnt + arg.service_queue_cnt); return -E2BIG; } @@ -2964,7 +2964,7 @@ static int ncdev_h2t_dma_alloc_queues(struct neuron_device *nd, unsigned int cmd done: if (ret) { u32 combined_queue_bmap = arg.copy_queue_bmap | arg.service_queue_bmap; - for (i=0; i < DMA_MAX_Q_V4; i++) { + for (i=0; i < ndhal->ndhal_udma.num_queues; i++) { if ((1<ndhal_udma.num_queues; i++) { int lret; if ((1<ndhal_perf.perf_set_profile(nd, arg.profile); } +static int ncdev_power_profile_get(struct neuron_device *nd, void *param) +{ + struct neuron_ioctl_power_profile arg; + int ret; + + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_power_profile*) param, sizeof(arg)); + if (ret) + return ret; + + if (arg.sz != sizeof(arg)) { + return -ENXIO; + } + if (arg.ctrl != 1) { + return -ENOTSUPP; + } + + ret = ndhal->ndhal_perf.perf_get_profile(nd, &arg.profile); + if (ret) + return ret; + + return copy_to_user(param, &arg, sizeof(arg)); +} + +static int ncdev_available_perf_profiles(struct neuron_device *nd, void *param) +{ + struct neuron_ioctl_available_perf_profiles arg; + int ret; + + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_available_perf_profiles*) param, sizeof(arg)); + if (ret) + return ret; + + ret = ndhal->ndhal_perf.perf_get_supported_profiles(nd, arg.requested_feature, &arg.num_profiles, arg.bitmap); + if (ret) + return ret; + + return copy_to_user(param, &arg, sizeof(arg)); +} + + static int ncdev_throttling_notifications_set(struct neuron_device *nd, void *param) { struct neuron_ioctl_throttling_notifications arg; @@ -3053,6 +3093,57 @@ static int ncdev_get_va_placement(void *param) return ret; } +static int ncdev_get_async_h2d_dma_compl_queues(struct neuron_device *nd, void *param) +{ + int ret = 0; + u32 qid = 0; + int eng_id = 0; + struct neuron_ioctl_get_async_h2t_dma_compl_queues arg; + + ret = neuron_copy_from_user(__func__, + &arg, + (struct neuron_ioctl_get_async_h2t_dma_compl_queues *)param, + sizeof(arg)); + if (ret) { + return ret; + } + + /* TODO: start h2d kernel thread */ + + if (arg.nc_id >= ndhal->ndhal_address_map.nc_per_device) { + pr_err("nd%02d: invalid nc %u provided\n", nd->device_index, arg.nc_id); + return -EINVAL; + } + + memset(arg.compl_queue_info, 0, sizeof(arg.compl_queue_info)); + 
+ eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, arg.nc_id); + + for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + if (!(arg.qid_bitmap & (1u << qid))) { + continue; + } + + struct ndma_ring *ring = &nd->ndma_engine[eng_id].queues[qid].ring_info; + struct ndma_h2d_compl_queue *compl_queue = &ring->dma_compl_queue; + struct mem_chunk *mc = compl_queue->mc; + + if (!mc) { + pr_err("nd%02d: invalid h2d qid %u; compl queue not initialized\n", + nd->device_index, qid); + return -EINVAL; + } + + arg.compl_queue_info[qid].mmap_offset = nmmap_offset(mc); + arg.compl_queue_info[qid].mmap_size = sizeof(neuron_h2d_dma_compl_queue_t) + + ((compl_queue->capacity_mask + 1) * sizeof(neuron_h2d_dma_compl_queue_entry_t)); + } + + ret = copy_to_user(param, &arg, sizeof(arg)); + + return ret; +} + inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsigned long param) { if ((cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK) || (cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK_EXT0)) { return ncdev_crwl_nc_range_mark(filep, cmd, (void *)param); @@ -3282,8 +3373,14 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para return ncdev_h2t_dma_free_queues(nd, cmd, (void*)param); } else if (cmd == NEURON_IOCTL_POWER_PROFILE) { return ncdev_power_profile_set(nd, (void*)param); + } else if (cmd == NEURON_IOCTL_GET_PERFORMANCE_PROFILE) { + return ncdev_power_profile_get(nd, (void*)param); } else if (cmd == NEURON_IOCTL_THROTTLING_NOTIFICATIONS) { return ncdev_throttling_notifications_set(nd, (void*)param); + } else if (cmd == NEURON_IOCTL_AVAILABLE_PERF_PROFILES) { + return ncdev_available_perf_profiles(nd, (void*)param); + } else if (cmd == NEURON_IOCTL_GET_ASYNC_H2T_DMA_COMPL_QUEUES) { + return ncdev_get_async_h2d_dma_compl_queues(nd, (void*)param); } // B/W compatibility @@ -3374,8 +3471,6 @@ static int ncdev_flush(struct file *filep, fl_owner_t id) // If this proc exited in the middle of a reset, wait for the reset to be processed. nr_wait(nd, task_tgid_nr(current), true); - ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit(); - ndmar_handle_process_exit(nd, task_tgid_nr(current)); msleep(10); // TODO - confirm with HW dev, whether any delay needed after q reset. 
ncrwl_release_current_process(nd); @@ -3517,11 +3612,26 @@ static ssize_t neuron_connected_devices_show(struct device *dev, struct device_a static DEVICE_ATTR(connected_devices, S_IRUSR, neuron_connected_devices_show, NULL); +static ssize_t fw_api_version_show(struct device *dev, struct device_attribute *attr, char *buf) +{ int fw_api_version; + int minor = MINOR(dev->devt); + struct neuron_device *nd = devnodes[minor].ndev; + + fw_io_api_version_read(nd->npdev.bar0, &fw_api_version); + if (fw_api_version == 0xdeadbeef) { // the value is not readable during reset, try later + return sprintf(buf, "busy\n"); + } + return sprintf(buf, "%u\n", fw_api_version); +} + +static DEVICE_ATTR(fw_api_version, S_IRUGO, fw_api_version_show, NULL); + static struct attribute *attrs[] = { &dev_attr_reset.attr, &dev_attr_core_count.attr, &dev_attr_connected_devices.attr, - NULL, + &dev_attr_fw_api_version.attr, + NULL, }; static struct attribute_group attr_group = { @@ -3620,14 +3730,22 @@ int ncdev_delete_device_node(struct neuron_device *ndev) /* * neuron_device class sysfs nodes - * node_id_2/4 - * node_cnt_2/4 - * server_id_2/4 + * ULTRASERVER + * node_id_2/4 + * server_id_2/4 + * ultraserver_mode + * + * PDS + * node_id + * node_cnt + * reservation_id + * ultraserver_mode * */ struct ncdev_class_attr { struct class_attribute attr; + enum neuron_platform_type platform_type; u32 info; }; @@ -3649,6 +3767,24 @@ static ssize_t ncdev_class_node_id_show(struct class *class, struct class_attrib return ndhal->ndhal_npe.npe_class_node_id_show_data(buf, ca->info); } +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +static ssize_t ncdev_class_node_cnt_show(const struct class *class, const struct class_attribute *attr, char *buf) +#else +static ssize_t ncdev_class_node_cnt_show(struct class *class, struct class_attribute *attr, char *buf) +#endif +{ + //struct ncdev_class_attr *ca = container_of(attr, struct ncdev_class_attr, attr); + + // protect against ndhal initialization race + if (ndhal == NULL) { + return 0; + } + if (ndhal->ndhal_npe.npe_class_node_cnt_show_data == NULL) { + return 0; + } + return ndhal->ndhal_npe.npe_class_node_cnt_show_data(buf); +} + #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) static ssize_t ncdev_class_server_id_show(const struct class *class, const struct class_attribute *attr, char *buf) #else @@ -3667,6 +3803,23 @@ static ssize_t ncdev_class_server_id_show(struct class *class, struct class_attr return ndhal->ndhal_npe.npe_class_server_id_show_data(buf, ca->info); } +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +static ssize_t ncdev_class_reservation_id_show(const struct class *class, const struct class_attribute *attr, char *buf) +#else +static ssize_t ncdev_class_reservation_id_show(struct class *class, struct class_attribute *attr, char *buf) +#endif +{ + struct ncdev_class_attr *ca = container_of(attr, struct ncdev_class_attr, attr); + + // protect against ndhal initialization race + if (ndhal == NULL) { + return 0; + } + if (ndhal->ndhal_npe.npe_class_server_id_show_data == NULL) { + return 0; + } + return ndhal->ndhal_npe.npe_class_server_id_show_data(buf, ca->info); +} #if (!defined(RHEL_RELEASE_CODE) && 
(LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) static ssize_t ncdev_class_ultraserver_mode_show(const struct class *class, const struct class_attribute *attr, char *buf) @@ -3684,15 +3837,74 @@ static ssize_t ncdev_class_ultraserver_mode_show(struct class *class, struct cla return ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data(buf); } -#define NCDEV_CLASS_ATTR(name, f, i) \ - {__ATTR(name, S_IRUGO, f, NULL), i} +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +static ssize_t ncdev_class_hbm_7200_show(const struct class *class, const struct class_attribute *attr, char *buf) +#else +static ssize_t ncdev_class_hbm_7200_show(struct class *class, struct class_attribute *attr, char *buf) +#endif +{ + int i; + int supports_hbm_7200 = 1; + if (total_neuron_devices == 0) { + return dhal_sysfs_emit(buf, "busy\n"); + } + + for (i = 0; i < total_neuron_devices; i++) { + if (neuron_devices[i]->supports_hbm_7200 == -1) { + return dhal_sysfs_emit(buf, "busy\n"); + } + supports_hbm_7200 = supports_hbm_7200 & neuron_devices[i]->supports_hbm_7200; + } + + return dhal_sysfs_emit(buf, "%d\n", (supports_hbm_7200) ? 1 : 0); +} + +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +static ssize_t ncdev_class_cur_perf_profile_show(const struct class *class, const struct class_attribute *attr, char *buf) +#else +static ssize_t ncdev_class_cur_perf_profile_show(struct class *class, struct class_attribute *attr, char *buf) +#endif +{ + int i; + int cur_perf_profile; + if (total_neuron_devices == 0) { + return dhal_sysfs_emit(buf, "busy\n"); + } + + cur_perf_profile = neuron_devices[0]->current_perf_profile; + for (i = 1; i < total_neuron_devices; i++) { + if (neuron_devices[i]->current_perf_profile != cur_perf_profile) { + return dhal_sysfs_emit(buf, "-1\n"); + } + } + return dhal_sysfs_emit(buf, "%d\n", cur_perf_profile); +} + +#define NCDEV_CLASS_ATTR(name, f, p, i) \ + {__ATTR(name, S_IRUGO, f, NULL), p, i} static const struct ncdev_class_attr ncdev_class_attrs[] = { - NCDEV_CLASS_ATTR(node_id_2, ncdev_class_node_id_show, 2), - NCDEV_CLASS_ATTR(node_id_4, ncdev_class_node_id_show, 4), - NCDEV_CLASS_ATTR(server_id_2, ncdev_class_server_id_show, 2), - NCDEV_CLASS_ATTR(server_id_4, ncdev_class_server_id_show, 4), - NCDEV_CLASS_ATTR(ultraserver_mode, ncdev_class_ultraserver_mode_show, 0) + NCDEV_CLASS_ATTR(hbm_7200_capable, ncdev_class_hbm_7200_show, NEURON_PLATFORM_TYPE_STD, 0), + NCDEV_CLASS_ATTR(current_perf_profile, ncdev_class_cur_perf_profile_show, NEURON_PLATFORM_TYPE_STD, 0), +}; + +static const struct ncdev_class_attr ncdev_class_attrs_us[] = { + NCDEV_CLASS_ATTR(node_id_2, ncdev_class_node_id_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 2), + NCDEV_CLASS_ATTR(node_id_4, ncdev_class_node_id_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 4), + NCDEV_CLASS_ATTR(server_id_2, ncdev_class_server_id_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 2), + NCDEV_CLASS_ATTR(server_id_4, ncdev_class_server_id_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 4), + NCDEV_CLASS_ATTR(ultraserver_mode, ncdev_class_ultraserver_mode_show, NEURON_PLATFORM_TYPE_ULTRASERVER, 0), + NCDEV_CLASS_ATTR(hbm_7200_capable, ncdev_class_hbm_7200_show, NEURON_PLATFORM_TYPE_STD, 0), + NCDEV_CLASS_ATTR(current_perf_profile, 
ncdev_class_cur_perf_profile_show, NEURON_PLATFORM_TYPE_STD, 0), +}; + +static const struct ncdev_class_attr ncdev_class_attrs_pds[] = { + NCDEV_CLASS_ATTR(node_id, ncdev_class_node_id_show, NEURON_PLATFORM_TYPE_PDS, 0), + NCDEV_CLASS_ATTR(node_cnt, ncdev_class_node_cnt_show, NEURON_PLATFORM_TYPE_PDS, 0), + NCDEV_CLASS_ATTR(reservation_id, ncdev_class_reservation_id_show, NEURON_PLATFORM_TYPE_PDS, 0), + NCDEV_CLASS_ATTR(ultraserver_mode, ncdev_class_ultraserver_mode_show, NEURON_PLATFORM_TYPE_PDS, 0), + NCDEV_CLASS_ATTR(hbm_7200_capable, ncdev_class_hbm_7200_show, NEURON_PLATFORM_TYPE_STD, 0), + NCDEV_CLASS_ATTR(current_perf_profile, ncdev_class_cur_perf_profile_show, NEURON_PLATFORM_TYPE_STD, 0), }; static const struct class_attribute class_attr_node_id = @@ -3704,6 +3916,79 @@ static const struct class_attribute class_attr_server_id = static const struct class_attribute class_attr_ultraserver_mode = __ATTR(ultraserver_mode, S_IRUGO, ncdev_class_ultraserver_mode_show, NULL); +// per platform class attributes. TODO we may eventually want to split this out into a neuron_platform.c +// +static const struct { + const struct ncdev_class_attr *class_attrs; + int class_attrs_cnt; + enum neuron_platform_type platform_type; + } ncdev_platform_class_attrs[] = { + {ncdev_class_attrs, sizeof(ncdev_class_attrs) / sizeof(*ncdev_class_attrs), NEURON_PLATFORM_TYPE_STD}, + {ncdev_class_attrs_us, sizeof(ncdev_class_attrs_us) / sizeof(*ncdev_class_attrs_us), NEURON_PLATFORM_TYPE_ULTRASERVER}, + {ncdev_class_attrs_pds, sizeof(ncdev_class_attrs_pds) / sizeof(*ncdev_class_attrs_pds), NEURON_PLATFORM_TYPE_PDS}, + {NULL, 0, NEURON_PLATFORM_TYPE_INVALID}}; + +int ncdev_class_attr_init(void) +{ + int i; + int ret; + + if (neuron_dev_class) { + const struct ncdev_class_attr *class_attrs = NULL; + int class_attrs_cnt; + + for (i = 0; i < sizeof(ncdev_platform_class_attrs) / sizeof(*ncdev_platform_class_attrs); i++) { + if (ncdev_platform_class_attrs[i].platform_type == ndhal->ndhal_arch.platform_type) { + class_attrs = ncdev_platform_class_attrs[i].class_attrs; + class_attrs_cnt = ncdev_platform_class_attrs[i].class_attrs_cnt; + } + } + + // no class attributes for this platform type + if (class_attrs == NULL) { + return 0; + } + + for (i = 0; i < class_attrs_cnt; i++) { + ret = class_create_file(neuron_dev_class, &class_attrs[i].attr); + if (ret) { + pr_err("create class/%s failed", class_attrs[i].attr.attr.name); + goto fail; + } + } + } + return 0; + +fail: + return ret; +} + +void ncdev_class_attr_cleanup(void) +{ + int i; + + if (neuron_dev_class) { + const struct ncdev_class_attr *class_attrs = NULL; + int class_attrs_cnt; + + for (i = 0; i < sizeof(ncdev_platform_class_attrs) / sizeof(*ncdev_platform_class_attrs); i++) { + if (ncdev_platform_class_attrs[i].platform_type == ndhal->ndhal_arch.platform_type) { + class_attrs = ncdev_platform_class_attrs[i].class_attrs; + class_attrs_cnt = ncdev_platform_class_attrs[i].class_attrs_cnt; + } + } + + // no class attributes for this platform type + if (class_attrs == NULL) { + return; + } + + for (i = 0; i < class_attrs_cnt; i++) { + class_remove_file(neuron_dev_class, &class_attrs[i].attr); + } + } +} + static void ncdev_cleanup(void) { int i; @@ -3713,9 +3998,6 @@ static void ncdev_cleanup(void) } if (neuron_dev_class) { - for (i = 0; i < sizeof(ncdev_class_attrs) / sizeof(*ncdev_class_attrs); i++) { - class_remove_file(neuron_dev_class, &ncdev_class_attrs[i].attr); - } class_destroy(neuron_dev_class); } @@ -3748,13 +4030,6 @@ int ncdev_module_init(void) goto fail; 
} - for (i = 0; i < sizeof(ncdev_class_attrs) / sizeof(*ncdev_class_attrs); i++) { - ret = class_create_file(neuron_dev_class, &ncdev_class_attrs[i].attr); - if (ret) { - pr_err("create class/%s failed", ncdev_class_attrs[i].attr.attr.name); - goto fail; - } - } return ret; fail: diff --git a/neuron_cdev.h b/neuron_cdev.h index f901a3e..0e24758 100644 --- a/neuron_cdev.h +++ b/neuron_cdev.h @@ -40,6 +40,19 @@ int ncdev_create_device_node(struct neuron_device *ndev); */ int ncdev_delete_device_node(struct neuron_device *ndev); +/** + * ncdev_class_attr_init() - initialize global class attributes + * + * @return int: return 0 on success, otherwise failure + */ +int ncdev_class_attr_init(void); + +/** + * ncdev_class_attr_cleanup() - cleanup glboal class attributes + * + */ +void ncdev_class_attr_cleanup(void); + /** * ncdev_module_init() - Initialize the kernel module that creates the character devices * diff --git a/neuron_core.c b/neuron_core.c index 89c3d4a..a15718e 100644 --- a/neuron_core.c +++ b/neuron_core.c @@ -49,24 +49,36 @@ DECLARE_FAULT_ATTR(neuron_fail_nc_mmap); int nc_semaphore_read(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 *result) { + int ret = 0; void *addr; if (semaphore_index >= ndhal->ndhal_address_map.semaphore_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id); + ret = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id, &addr); + if (ret) { + pr_err("failed to retrieve semaphore base"); + return ret; + } + addr += ndhal->ndhal_address_map.mmap_nc_sema_read_offset + (semaphore_index * NC_SEMAPHORE_SIZE); - return ndhal->ndhal_reg_access.reg_read32_array((void **)&addr, result, 1); + return ndhal->ndhal_fw_io.fw_io_read_csr_array((void **)&addr, result, 1, true); } int nc_semaphore_write(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value) { + int ret = 0; void *addr; if (semaphore_index >= ndhal->ndhal_address_map.semaphore_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id); + ret = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id, &addr); + if (ret) { + pr_err("failed to retrieve semaphore base"); + return ret; + } + addr += ndhal->ndhal_address_map.mmap_nc_sema_set_offset + (semaphore_index * NC_SEMAPHORE_SIZE); writel(value, addr); return 0; @@ -74,12 +86,18 @@ int nc_semaphore_write(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, int nc_semaphore_increment(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value) { + int ret = 0; void *addr; if (semaphore_index >= ndhal->ndhal_address_map.semaphore_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id); + ret = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id, &addr); + if (ret) { + pr_err("failed to retrieve semaphore base"); + return ret; + } + addr += ndhal->ndhal_address_map.mmap_nc_sema_incr_offset + (semaphore_index * NC_SEMAPHORE_SIZE); writel(value, addr); return 0; @@ -87,12 +105,18 @@ int nc_semaphore_increment(struct neuron_device *nd, u8 nc_id, u16 semaphore_ind int nc_semaphore_decrement(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value) { + int ret = 0; void *addr; if (semaphore_index >= ndhal->ndhal_address_map.semaphore_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id); + ret = ndhal->ndhal_nc.nc_get_semaphore_base(nd, nc_id, &addr); + if (ret) { + pr_err("failed to retrieve semaphore base"); + return ret; + } + addr += ndhal->ndhal_address_map.mmap_nc_sema_decr_offset + (semaphore_index * NC_SEMAPHORE_SIZE); writel(value, 
addr); return 0; @@ -100,23 +124,35 @@ int nc_semaphore_decrement(struct neuron_device *nd, u8 nc_id, u16 semaphore_ind int nc_event_get(struct neuron_device *nd, u8 nc_id, u16 event_index, u32 *result) { + int ret = 0; void *addr; if (event_index > ndhal->ndhal_address_map.event_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_event_addr(nd, nc_id, event_index); - return ndhal->ndhal_reg_access.reg_read32_array(&addr, result, 1); + ret = ndhal->ndhal_nc.nc_get_event_addr(nd, nc_id, event_index, &addr); + if (ret) { + pr_err("failed to retrieve event %u addr", event_index); + return ret; + } + + return ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, result, 1, true); } int nc_event_set(struct neuron_device *nd, u8 nc_id, u16 event_index, u32 value) { - u32 *addr; + int ret = 0; + void *addr; if (event_index > ndhal->ndhal_address_map.event_count) return -EINVAL; - addr = ndhal->ndhal_nc.nc_get_event_addr(nd, nc_id, event_index); + ret = ndhal->ndhal_nc.nc_get_event_addr(nd, nc_id, event_index, &addr); + if (ret) { + pr_err("failed to retrieve event %u addr", event_index); + return ret; + } + writel(value, addr); return 0; } diff --git a/neuron_device.h b/neuron_device.h index 93fd781..8af4f29 100644 --- a/neuron_device.h +++ b/neuron_device.h @@ -85,7 +85,7 @@ struct neuron_device { void *fw_io_ctx; - struct mempool_set mpset; + struct neuron_mempool_set mpset; // memory chunk allocated for notification queue in each neuron core. struct mem_chunk *nq_mc[MAX_NC_PER_DEVICE][MAX_NQ_SUPPORTED]; @@ -118,6 +118,14 @@ struct neuron_device { struct neuron_log_obj log_obj; // logging object struct neuron_hbm_scrub_ctx hbm_scrub_ctx; + + // volatile to prevent compiler optimizations since accessed by different threads + // Indicates whether any performance profile with 7200 Mhz HBM is supported by this device + volatile int supports_hbm_7200; + + // volatile to prevent compiler optimizations since accessed by different threads + // This is the true value per-device, instead of the global one in ndhal_perf used only for metrics + volatile int current_perf_profile; }; #endif diff --git a/neuron_dhal.c b/neuron_dhal.c index bb269c2..179cb8e 100644 --- a/neuron_dhal.c +++ b/neuron_dhal.c @@ -2,6 +2,7 @@ #include "neuron_arch.h" #include "neuron_dhal.h" +#include "neuron_cdev.h" struct neuron_dhal *ndhal = NULL; @@ -24,10 +25,9 @@ int neuron_dhal_init(unsigned int pci_device_id) { return -ENOMEM; } } else { - mutex_unlock(&ndhal_init_lock); - return 0; + mutex_unlock(&ndhal_init_lock); + return 0; } - mutex_unlock(&ndhal_init_lock); ndhal->ndhal_arch.arch = narch_get_arch(); ndhal->pci_device_id = pci_device_id; @@ -46,15 +46,23 @@ int neuron_dhal_init(unsigned int pci_device_id) { break; default: pr_err("Unknown HW architecture: %d. 
Can't init neuron_dhal.\n", ndhal->ndhal_arch.arch); + mutex_unlock(&ndhal_init_lock); return -EINVAL; } + // global class attributes get delayed initialization - need platform data from dhal + ncdev_class_attr_init(); + + mutex_unlock(&ndhal_init_lock); + return ret; } void neuron_dhal_cleanup(void) { if (ndhal) { + ncdev_class_attr_cleanup(); + if (ndhal->ndhal_ext_cleanup) { ndhal->ndhal_ext_cleanup(); } diff --git a/neuron_dhal.h b/neuron_dhal.h index ab34019..dbce141 100644 --- a/neuron_dhal.h +++ b/neuron_dhal.h @@ -30,7 +30,6 @@ struct ndhal_arch { struct ndhal_address_map { // addresses uint64_t pci_host_base; - uint64_t mmap_p_offset; uint64_t mmap_nc_event_offset; uint64_t mmap_nc_sema_read_offset; uint64_t mmap_nc_sema_set_offset; @@ -39,9 +38,6 @@ struct ndhal_address_map { uint64_t bar0_misc_ram_offset; uint64_t port_1_base; - // sizes - uint64_t mmap_nc_size; - // counts int nc_per_device; unsigned dice_per_device; @@ -56,13 +52,12 @@ struct ndhal_address_map { }; struct ndhal_reset { - uint64_t reset_poll_interval; uint64_t reset_tpb_initial_poll_delay; uint64_t initiate_max_wait_time; uint32_t retry_count; int (*nr_initiate_reset) (struct neuron_device *nd, uint32_t nc_map); int (*nr_wait_for_reset_completion) (struct neuron_device *nd); - int (*nr_post_reset_config) (struct neuron_device *nd, bool reset_successful); + int (*nr_post_reset_config) (struct neuron_device *nd, bool reset_successful, bool is_no_reset); }; struct ndhal_topsp { @@ -75,8 +70,8 @@ struct ndhal_topsp { }; struct ndhal_nc { - void *(*nc_get_semaphore_base) (struct neuron_device *nd, u8 nc_id); - void *(*nc_get_event_addr) (struct neuron_device *nd, u8 nc_id, u16 event_index); + int (*nc_get_semaphore_base) (struct neuron_device *nd, u8 nc_id, void **sem_base); + int (*nc_get_event_addr) (struct neuron_device *nd, u8 nc_id, u16 event_index, void **ev_addr); }; struct ndhal_nq { @@ -89,8 +84,7 @@ struct ndhal_mpset { u64 device_dram_effective_base_addr[MAX_DRAM_CHANNELS]; u64 device_dram_end_addr[MAX_DRAM_CHANNELS]; bool small_pool_supported; - void (*mpset_set_dram_and_mpset_info) (struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size); - int (*mpset_block_carveout_regions) (struct neuron_device *nd, struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size); + void (*mpset_set_dram_and_mpset_info) (struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size); }; struct ndhal_ndmar { @@ -100,7 +94,6 @@ struct ndhal_ndmar { bool (*nr_init_h2t_eng) ( int nc_idx, uint32_t nc_map); bool (*ndmar_is_nx_ring) (uint32_t eng_id, uint32_t q_id); int (*ndmar_quiesce_queues) (struct neuron_device *nd, u32 nc_id, u32 engine_count, u32 *queue_mask); - void (*ndmar_set_model_started) (struct neuron_device *nd, phys_addr_t pa, struct mem_chunk *mc); }; struct ndhal_fw_io { @@ -111,10 +104,6 @@ struct ndhal_fw_io { int (*fw_io_post_metric) (struct fw_io_ctx *ctx, u8 *data, u32 size); }; -struct ndhal_reg_access { - int (*reg_read32_array) (void **addr, u32 *value, u32 num_values); -}; - struct ndhal_mmap { struct neuron_dm_special_mmap_ent *dm_mmap_special; int (*mmap_get_bar4_offset) (u64 start_addr, u64 size, u64 *offset); @@ -150,14 +139,6 @@ struct ndhal_pci { int dram_bar; u64 dram_bar_size; - int (*neuron_pci_release_bar) (struct pci_dev *dev, int bar); - int (*neuron_pci_reserve_bar) (struct pci_dev *dev, int bar, const char *res_name); - int (*neuron_pci_set_npdev) (struct pci_dev *dev, - int bar, - const char *res_name, - phys_addr_t *bar_pa, - void __iomem 
**bar_ioaddr, - u64 *bar_size); int (*neuron_pci_get_device_id) (struct neuron_device *nd, struct pci_dev *dev); int (*neuron_pci_device_id_to_rid_map) (uint32_t * count, uint32_t * did_to_rid_map); }; @@ -167,12 +148,12 @@ struct ndhal_cdev { u64 *ncdev_bar0_write_blocked_addrs; void (*ncdev_compatible_version) (struct neuron_ioctl_compatible_version *arg); - void (*ncdev_quiesce_exec_on_proc_exit) (void); int (*ncdev_logical_to_physical_nc_map)(struct neuron_ioctl_nc_map *map, uint32_t max_num_entries, enum neuron_ioctl_nc_mapping_type mapping_type); void (*ncdev_get_default_tpbs_for_hbm) (u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count); }; struct ndhal_udma { + unsigned int num_queues; unsigned int num_beats; }; @@ -193,6 +174,7 @@ struct ndhal_npe { int (*npe_pod_status)( u32 *pod_state, s8 *node_id); int (*npe_pod_ctrl)( struct neuron_device *nd, u32 pod_ctrl, enum neuron_ultraserver_mode mode, u32 timeout, u32 *pod_state); ssize_t (*npe_class_node_id_show_data)(char *buf, u32 sz); + ssize_t (*npe_class_node_cnt_show_data)(char *buf); ssize_t (*npe_class_server_id_show_data)(char *buf, u32 sz); ssize_t (*npe_class_ultraserver_mode_show_data)(char *buf); u32 (*npe_neighbor_eng_ids)[2]; @@ -217,6 +199,9 @@ struct ndhal_tpb { struct ndhal_perf { int current_performance_profile; int (*perf_set_profile) (struct neuron_device *nd, uint32_t profile); + int (*perf_get_profile) (struct neuron_device *nd, uint32_t *profile); + int (*perf_get_supported_profiles) (struct neuron_device *nd, u16 feature, u8 *num_profiles, u8 out_bitmap[32]); + void (*perf_update_hbm_7200_supported) (struct neuron_device *nd); }; struct neuron_dhal { @@ -231,7 +216,6 @@ struct neuron_dhal { struct ndhal_mpset ndhal_mpset; struct ndhal_ndmar ndhal_ndmar; struct ndhal_fw_io ndhal_fw_io; - struct ndhal_reg_access ndhal_reg_access; struct ndhal_mmap ndhal_mmap; struct ndhal_sysfs_metrics ndhal_sysfs_metrics; struct ndhal_pci ndhal_pci; diff --git a/neuron_dma.c b/neuron_dma.c index 8258605..32e7d43 100644 --- a/neuron_dma.c +++ b/neuron_dma.c @@ -9,12 +9,15 @@ #include #include #include +#include +#include #include "udma/udma.h" #include "neuron_trace.h" #include "neuron_device.h" #include "neuron_dma.h" #include "neuron_mempool.h" +#include "neuron_mmap.h" #include "neuron_dhal.h" #include "neuron_pci.h" @@ -233,6 +236,10 @@ int ndma_memcpy_wait_for_completion(struct ndma_eng *eng, struct ndma_ring *ring u64 first_wait_time, wait; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time(count, async, &first_wait_time, &wait); + // Increase the wait time on virtual platforms + if (narch_is_qemu() || narch_is_emu()) { + wait = wait * 100 * 1000; + } if (is_intra_device_dma && !async) { first_wait_time = 10; // device-to-device DMA is much faster, just choose a small value independent of number of descriptors wait = wait/200; // can probably be set even lower if required @@ -271,7 +278,7 @@ int ndma_memcpy_wait_for_completion(struct ndma_eng *eng, struct ndma_ring *ring } if (i > loop) { pr_err("DMA completion timeout on nd%02d for %s q%d desc count %u\n", eng->nd->device_index, eng->udma.name, ring->qid, count); - ret = -1; + ret = -ETIMEDOUT; goto error; } @@ -876,7 +883,7 @@ int ndma_bar0_blocked_one_engine(u64 base, u64 off) q_start = base + offsetof(struct unit_regs_v4, s2m); // start of s2m block q_start += offsetof(struct udma_s2m_regs_v4, s2m_q); // start of q registers } - for (qid = 0; qid < DMA_MAX_Q_V4; qid++) { + for (qid = 0; qid < ndhal->ndhal_udma.num_queues; qid++) { u64 q_off = q_start + q_size * 
qid; int i; for (i = 0; i < sizeof(udma_blocked) / sizeof(udma_blocked[0]); i++) { @@ -891,37 +898,460 @@ int ndma_bar0_blocked_one_engine(u64 base, u64 off) /* * Zero copy impementation. - * - * - * */ +/* Context for tracking a single tensor batch operation on submit flow */ +struct ndma_h2t_zcdma_op_context { + void *host_addr; + dma_addr_t dev_addr; + u64 offset; + u64 pin_size; + u64 remaining; +}; + +/* DMA context state */ +enum ndma_zcdma_state { + NDMA_INVALID = 0, + NDMA_UNPINNED, + NDMA_PINNED_UNSUBMITTED, + NDMA_SUBMITTED, + NDMA_COMPLETED, // not in dma context queue anymore +}; + +/* DMA context */ struct ndma_h2t_zcdma_context { - struct ndma_eng *eng; // engine - struct ndma_ring *ring; // - void *host_addr; // host address - dma_addr_t dev_addr; // device address - u64 size; // size for this transfer - bool direction; // direction. true = to device - bool last; // last transfer for the entire request. - u64 start_time; // start time for this transfer - int nr_pages; // number of pages for this transfer - int nr_desc; // number of descriptors which is equal to pending transfers -1 - void *completion_ptr; // completion buffer pointer (host memory buffer we poll on for completions) - struct page **page_list; // page structures tracking our pinned pages + struct ndma_eng *eng; // engine + struct ndma_ring *ring; // ring + + // Submission-related + void *host_addr; // host address + dma_addr_t dev_addr; // device address + u64 size; // size for this transfer + bool direction; // direction. true = to-device/write/copy-in + bool last; // last transfer for the entire request. + u64 start_time; // start time for this transfer + int nr_pages; // number of pages for this transfer + int nr_desc; // number of descriptors which is equal to pending transfers -1 + struct page **page_list; // page structures tracking our pinned pages; + // managed by page_list_pool in ctx queue + enum ndma_zcdma_state state; // state of this transfer + + // Completion-related + void *completion_ptr; // completion buffer pointer; + // host memory buffer which driver polls on for completions; + // managed by completion_pool in ctx queue + u64 sequence_num; // async sequence number; 0 for sync transfers + + // Async-only + struct mm_struct *mm; // mm that owns the user buffers }; +static void ndma_zc_release_ctx(struct ndma_h2t_zcdma_context *ctx, u64 *nr_pinned_pages) +{ + // do not free or set completion_ptr null. it is managed by completion_pool in ctx queue + // do not free or set page_list null. 
it is managed by page_list_pool in ctx queue + + if (ctx->state >= NDMA_PINNED_UNSUBMITTED) { + if (ctx->direction) { + unpin_user_pages(ctx->page_list, ctx->nr_pages); + } else { + unpin_user_pages_dirty_lock(ctx->page_list, ctx->nr_pages, true); + } + *nr_pinned_pages -= ctx->nr_pages; + } + ctx->nr_pages = 0; + + if (ctx->mm) { + mmput(ctx->mm); + ctx->mm = NULL; + } + + ctx->state = NDMA_INVALID; + ctx->sequence_num = 0; +} + +/* H2D DMA Completion Queue (CQ) */ +#define NDMA_H2D_COMPL_QUEUE_CAPACITY 1024 +int ndma_h2d_compl_queue_init(struct neuron_device *nd, struct ndma_h2d_compl_queue *compl_queue) +{ + int ret = 0; + size_t queue_size = 0; + struct mem_chunk *mc = NULL; + neuron_h2d_dma_compl_queue_t *compl_queue_shared = NULL; + + queue_size = sizeof(neuron_h2d_dma_compl_queue_t) + (NDMA_H2D_COMPL_QUEUE_CAPACITY * sizeof(neuron_h2d_dma_compl_queue_entry_t)); + ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, queue_size, 0, + MEM_LOC_HOST, 0, 0, 0, + NEURON_MEMALLOC_TYPE_NCDEV_HOST, &mc); + if (ret) { + pr_err("failed to allocate h2d dma completion queue mc: %d\n", ret); + return ret; + } + ret = nmch_handle_alloc(nd, mc, &mc->mc_handle); + if (ret) { + pr_err("failed to allocate mc handle for h2d dma completion queue: %d\n", ret); + mc_free(&mc); + return ret; + } + memset(mc->va, 0, queue_size); + + compl_queue_shared = (neuron_h2d_dma_compl_queue_t *)mc->va; + compl_queue_shared->capacity = NDMA_H2D_COMPL_QUEUE_CAPACITY; + compl_queue_shared->head = 0; + compl_queue_shared->tail = 0; -#define NDMA_ZC_PAGES_PER_XFER 64 // number of pages in each zero copy dma transfer. This is somewhat, but not + compl_queue->mc = mc; + compl_queue->compl_queue_shared = compl_queue_shared; + compl_queue->capacity_mask = NDMA_H2D_COMPL_QUEUE_CAPACITY - 1; + compl_queue->tail = 0; + + return 0; +} + +void ndma_h2d_compl_queue_destroy(struct ndma_h2d_compl_queue *compl_queue) +{ + if (compl_queue->mc) { + mc_free(&compl_queue->mc); + } + compl_queue->mc = NULL; + compl_queue->compl_queue_shared = NULL; + compl_queue->capacity_mask = 0; + compl_queue->tail = 0; +} + +static void ndma_h2d_compl_queue_put(struct ndma_h2d_compl_queue *compl_queue, + u64 sequence_num, + s64 compl_ret, + void *context) +{ + u32 head = 0; + u32 tail = 0; + neuron_h2d_dma_compl_queue_t *compl_queue_shared = compl_queue->compl_queue_shared; + neuron_h2d_dma_compl_queue_entry_t *entry = NULL; + + head = smp_load_acquire(&compl_queue_shared->head); + tail = compl_queue->tail; + + while ((tail - head) >= (compl_queue->capacity_mask + 1)) { + pr_warn_once("h2d dma completion queue full; blocking until space is available\n"); + msleep(1); + head = smp_load_acquire(&compl_queue_shared->head); + tail = compl_queue->tail; + } + + entry = &compl_queue_shared->entries[tail & compl_queue->capacity_mask]; + + /* Write completion result to tail */ + entry->compl_ret = compl_ret; + entry->context = context; + entry->sequence_num = sequence_num; + + /* Move tail */ + compl_queue->tail = tail + 1; + smp_store_release(&compl_queue_shared->tail, compl_queue->tail); +} + +#define NDMA_ZC_PAGES_PER_XFER 64 // number of pages in each zero copy dma transfer. This is somewhat, but not // totally arbitrary. We don't want to pin a lot of pages. We just want to // pin enough where (approximately): // dma time > (pin time + setup time + completion update + initial poll wait) // That's the simple explanation. It's a tad more complicated in trading off smaller // transfers where even if that equation doesn't hold, the overlap can be beneficial. 
// Right now the sweet spot looks to be ~ 64 pages. More tuning is required. - // #define NDMA_ZC_MIN_PAGES_PER_XFER 64 +/* Hysteresis thresholds for descriptor wait checks in submission flow. */ +#define NDMA_ZC_DESC_WAIT_THRESHOLD_LO (NDMA_ZC_PAGES_PER_XFER + 1) +#define NDMA_ZC_DESC_WAIT_THRESHOLD_HI (NDMA_ZC_DESC_WAIT_THRESHOLD_LO * 8) + +/* DMA ctx queue constants */ +#define NDMA_CTX_QUEUE_DEFAULT_CAPACITY 1024 +#define NDMA_CTX_QUEUE_MAX_PINNED_PAGES 524288 + +/* Skip tombstone ctxs */ +static void ndma_ctx_queue_advance_to_valid(struct ndma_ctx_queue *queue, u32 *idx, u32 stop) +{ + while (*idx != stop && queue->entries[*idx].state == NDMA_INVALID) { + *idx = (*idx + 1) & queue->capacity_mask; + } +} + +/* Check empty or full */ +static bool ndma_ctx_queue_is_empty(const struct ndma_ctx_queue *queue) +{ + return queue->head == queue->tail; +} + +static bool ndma_ctx_queue_is_full(const struct ndma_ctx_queue *queue) +{ + return queue->head == ((queue->tail + 1) & queue->capacity_mask); +} + +static bool ndma_ctx_queue_submitted_empty(const struct ndma_ctx_queue *queue) +{ + return queue->head == queue->first_pinned_unsubmitted; +} + +static bool ndma_ctx_queue_pinned_unsubmitted_empty(const struct ndma_ctx_queue *queue) +{ + return queue->first_pinned_unsubmitted == queue->first_unpinned; +} + +static bool ndma_ctx_queue_unpinned_empty(const struct ndma_ctx_queue *queue) +{ + return queue->first_unpinned == queue->tail; +} + +/* Increment to next index */ +static void ndma_ctx_queue_inc_first_pinned_unsubmitted(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_pinned_unsubmitted_empty(queue)) { + return; + } + queue->first_pinned_unsubmitted = (queue->first_pinned_unsubmitted + 1) & queue->capacity_mask; + ndma_ctx_queue_advance_to_valid(queue, &queue->first_pinned_unsubmitted, queue->first_unpinned); +} + +static void ndma_ctx_queue_inc_first_unpinned(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_unpinned_empty(queue)) { + return; + } + queue->first_unpinned = (queue->first_unpinned + 1) & queue->capacity_mask; + ndma_ctx_queue_advance_to_valid(queue, &queue->first_unpinned, queue->tail); +} + +static void ndma_ctx_queue_inc_tail(struct ndma_ctx_queue *queue) +{ + u32 old_tail = queue->tail; + u32 new_tail = (old_tail + 1) & queue->capacity_mask; + + // Assume the ctx at old tail is already filled by caller + // Tail advance may also initialize/advance the pinned+unsubmitted and unpinned pointers + struct ndma_h2t_zcdma_context *ctx = &queue->entries[old_tail]; + if (ctx->state == NDMA_PINNED_UNSUBMITTED) { + if (ndma_ctx_queue_pinned_unsubmitted_empty(queue)) { + // The first pinned+unsubmitted pointer appears at old_tail + queue->first_pinned_unsubmitted = old_tail; + } + if (ndma_ctx_queue_unpinned_empty(queue)) { + // No unpinned elements yet; start after the new tail + queue->first_unpinned = new_tail; + } + } else if (ctx->state == NDMA_UNPINNED) { + if (ndma_ctx_queue_unpinned_empty(queue)) { + // The first unpinned pointer appears at old_tail + queue->first_unpinned = old_tail; + } + } + + // Move tail forward after updating the two pointers + queue->tail = new_tail; +} + +/* Peek */ +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_peek_tail(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_is_full(queue)) { + return NULL; + } + return &queue->entries[queue->tail]; +} + +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_peek_pinned_unsubmitted(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_pinned_unsubmitted_empty(queue)) { + return NULL; + } 
+ return &queue->entries[queue->first_pinned_unsubmitted]; +} + +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_peek_first_unpinned(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_unpinned_empty(queue)) { + return NULL; + } + return &queue->entries[queue->first_unpinned]; +} + +/* Pop */ +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_pop_head(struct ndma_ctx_queue *queue) +{ + u32 old_head; + struct ndma_h2t_zcdma_context *ctx = NULL; + + if (ndma_ctx_queue_is_empty(queue)) { + return NULL; + } + + old_head = queue->head; + ctx = &queue->entries[old_head]; + queue->head = (queue->head + 1) & queue->capacity_mask; + ndma_ctx_queue_advance_to_valid(queue, &queue->head, queue->tail); + + if (ndma_ctx_queue_is_empty(queue)) { + queue->first_pinned_unsubmitted = queue->tail; + queue->first_unpinned = queue->tail; + } else { + if (old_head == queue->first_pinned_unsubmitted) { + ndma_ctx_queue_inc_first_pinned_unsubmitted(queue); + } + if (old_head == queue->first_unpinned) { + ndma_ctx_queue_inc_first_unpinned(queue); + } + } + + return ctx; +} + +static struct ndma_h2t_zcdma_context *ndma_ctx_queue_pop_submitted(struct ndma_ctx_queue *queue) +{ + if (ndma_ctx_queue_submitted_empty(queue)) { + return NULL; + } + + return ndma_ctx_queue_pop_head(queue); +} + +/* Failure-path helper. + * Given a sequence number of a async request, wait for any matching submitted ctxs, then reset all matching ctxs. + * This prevents further remote pinning and submitting on a failed async request. + * Mostly used in failure and cleanup paths, so don't stop on failed DMAs. + */ +static void ndma_ctx_queue_drain_sequence(struct ndma_ctx_queue *queue, u64 sequence_num) +{ + u32 idx; + + for (idx = queue->head; idx != queue->tail; idx = (idx + 1) & queue->capacity_mask) { + struct ndma_h2t_zcdma_context *ctx = &queue->entries[idx]; + + if (ctx->sequence_num == sequence_num) { + // wait for already submitted DMAs to complete. + if (ctx->state == NDMA_SUBMITTED) { + ndma_memcpy_wait_for_completion(ctx->eng, ctx->ring, ctx->nr_desc + 1, ctx->completion_ptr, false, false); + } + + // release pinned pages and mm, and set state to invalid (tombstone). + ndma_zc_release_ctx(ctx, &queue->nr_pinned_pages); + } + } + + // After draining, advance the pointers to skip the invalidated ctxs. + ndma_ctx_queue_advance_to_valid(queue, &queue->first_unpinned, queue->tail); + ndma_ctx_queue_advance_to_valid(queue, &queue->first_pinned_unsubmitted, queue->first_unpinned); + ndma_ctx_queue_advance_to_valid(queue, &queue->head, queue->tail); +} + +/* Failure-path helper. + * Wait for submitted contexts from head up to (but not including) first_pinned_unsubmitted. + * Unpin from head up to (but not including) first_unpinned. + * Mostly used in failure and cleanup paths, so don't stop on failed DMAs. 
+ */ +static void ndma_ctx_queue_drain(struct ndma_eng *eng, + struct ndma_ring *ring, + struct ndma_ctx_queue *queue) +{ + while (!ndma_ctx_queue_is_empty(queue)) { + struct ndma_h2t_zcdma_context *ctx = ndma_ctx_queue_pop_head(queue); + + if (ctx->state == NDMA_SUBMITTED) { + ndma_memcpy_wait_for_completion(eng, ring, ctx->nr_desc + 1, ctx->completion_ptr, false, false); + } + + ndma_zc_release_ctx(ctx, &queue->nr_pinned_pages); + } +} + +/* Init and destroy queue */ +int ndma_ctx_queue_init(struct ndma_ctx_queue *queue) +{ + int i; + + if (!queue) { + pr_err("ctx queue pointer cannot be NULL\n"); + return -EINVAL; + } + + memset(queue, 0, sizeof(*queue)); + + u32 capacity = NDMA_CTX_QUEUE_DEFAULT_CAPACITY; + if (!is_power_of_2(capacity)) { + pr_err("ctx queue capacity must be power of two\n"); + return -EINVAL; + } + queue->capacity_mask = capacity - 1; + queue->head = 0; + queue->tail = 0; + queue->first_pinned_unsubmitted = 0; + queue->first_unpinned = 0; + + queue->entries = kvcalloc(capacity, sizeof(*queue->entries), GFP_KERNEL); + if (!queue->entries) { + pr_err("failed to allocate ctx queue entries\n"); + return -ENOMEM; + } + + // allocate completion ptrs in one contiguous array at once, + // and let queue->entries[i].completion_ptr point to each completion buffer + queue->completion_pool = kcalloc(capacity, DMA_COMPLETION_MARKER_SIZE * 2, GFP_KERNEL); + if (!queue->completion_pool) { + pr_err("failed to allocate ctx queue completion pool\n"); + goto err; + } + + // allocate page_list arrays in one contiguous pool, and let each entry point to its slice + queue->page_list_pool = kcalloc(capacity * NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); + if (!queue->page_list_pool) { + pr_err("failed to allocate ctx queue page_list pool\n"); + goto err; + } + + for (i = 0; i < capacity; i++) { + queue->entries[i].completion_ptr = + (u8 *)queue->completion_pool + i * DMA_COMPLETION_MARKER_SIZE * 2; + queue->entries[i].page_list = + (struct page **)queue->page_list_pool + i * NDMA_ZC_PAGES_PER_XFER; + } + + return 0; + +err: + if (queue->completion_pool) { + kfree(queue->completion_pool); + queue->completion_pool = NULL; + } + if (queue->page_list_pool) { + kfree(queue->page_list_pool); + queue->page_list_pool = NULL; + } + if (queue->entries) { + kvfree(queue->entries); + queue->entries = NULL; + } + return -ENOMEM; +} + +void ndma_ctx_queue_free(struct ndma_eng *eng, struct ndma_ring *ring, struct ndma_ctx_queue *queue) +{ + if (!queue) { + return; + } + if (queue->entries) { + ndma_ctx_queue_drain(eng, ring, queue); + kvfree(queue->entries); + queue->entries = NULL; + } + if (queue->completion_pool) { + kfree(queue->completion_pool); + queue->completion_pool = NULL; + } + if (queue->page_list_pool) { + kfree(queue->page_list_pool); + queue->page_list_pool = NULL; + } + memset(queue, 0, sizeof(*queue)); +} + /** ndma_calc_zc_pin_size() * * determine how many pages to pin per step for zercopy dma pipelining. @@ -960,7 +1390,7 @@ bool ndma_zerocopy_supported(void) * Think about using some permanent location in HBM as source for completion descriptor update. Like * why are we reading across the PCIe bus to fetch completion data. 
*/ -static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) +static int ndma_build_n_issue_zc_descs(struct ndma_h2t_zcdma_context * dma_ctx) { int ret; unsigned long offset = (unsigned long)(dma_ctx->host_addr) & (PAGE_SIZE-1); @@ -970,7 +1400,7 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) int i = 0; u64 chunk_size; int pending_transfers = 0; - int barrier_type; + int barrier_type; while (i < dma_ctx->nr_pages) { dma_addr_t src_addr; @@ -1035,7 +1465,7 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) pending_transfers++; } } - + dma_ctx->nr_desc = pending_transfers; if (narch_get_arch() != NEURON_ARCH_V2) @@ -1050,202 +1480,387 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) pending_transfers++; ret = udma_m2m_copy_start(&dma_ctx->eng->udma, dma_ctx->ring->qid, pending_transfers, pending_transfers); - if (ret) { pr_info("copy start failed %d\n", ret); } + dma_ctx->state = NDMA_SUBMITTED; error: return ret; } -/** - * ndma_zerocopy_wait_for_completion() - * - * - * - */ -static int ndma_zerocopy_wait_for_completion( struct neuron_device *nd, u32 nc_id, struct ndma_eng *eng, struct ndma_ring *ring, - struct ndma_h2t_zcdma_context * dma_ctx, struct ndma_h2t_zcdma_context * ndma_ctx) +/* Return the number of descriptors available (TX-only; TX/RX counts match) */ +static u32 ndma_zc_descs_available(struct ndma_eng *eng, u32 qid) { - int ret; + struct udma_q *txq; + u32 tx_desc_available; - ret = ndma_memcpy_wait_for_completion(eng, ring, dma_ctx->nr_desc+1, dma_ctx->completion_ptr, true, false); - //atomic_sub(dma_ctx->nr_desc+1, &dma_ctx->ring->h2t_outstanding_desc); + udma_q_handle_get(&eng->udma, qid, UDMA_TX, &txq); - if (ret == 0) { - if (dma_ctx->direction) - unpin_user_pages(dma_ctx->page_list, dma_ctx->nr_pages); - else - unpin_user_pages_dirty_lock(dma_ctx->page_list, dma_ctx->nr_pages, true); - return ret; - } + tx_desc_available = udma_available_get(txq); - // If we are exiting here, we've failed so unpin pages associated with the DMA. If the next DMA - // context is valid, do an obligatory wait for the DMA operation so we don't splat data on someone - // else's memory just in case the physical pages are reassigned after unpinning. - // - unpin_user_pages(dma_ctx->page_list, dma_ctx->nr_pages); + /* TX/RX descriptor availability is kept in lock-step. */ + return tx_desc_available; +} - // blindly wait - if (ndma_ctx != NULL) { - ndma_memcpy_wait_for_completion(eng, ring, ndma_ctx->nr_desc+1, ndma_ctx->completion_ptr, false, false); - unpin_user_pages(ndma_ctx->page_list, ndma_ctx->nr_pages); +/* Estimate if a zero-copy DMA context fits in the available descriptors. 
*/ +static bool _ndma_zc_descs_available(struct ndma_eng *eng, u32 qid, u32 threshold) +{ + u32 max_descs_required = threshold + 1; /* +1 for completion descriptor */ + + return ndma_zc_descs_available(eng, qid) >= max_descs_required; +} + +/* Whether we should wait for some completions before submitting more in the next iteration */ +static bool ndma_zc_should_wait(struct ndma_eng *eng, + struct ndma_ring *ring, + struct ndma_ctx_queue *ctx_queue, + u32 *desc_threshold) +{ + bool pinned_at_max; + bool desc_ring_full; + bool ctx_queue_full; + + pinned_at_max = ctx_queue->nr_pinned_pages >= NDMA_CTX_QUEUE_MAX_PINNED_PAGES; + ctx_queue_full = ndma_ctx_queue_is_full(ctx_queue); + desc_ring_full = !_ndma_zc_descs_available(eng, ring->qid, *desc_threshold); + + if (pinned_at_max || desc_ring_full || ctx_queue_full) { + *desc_threshold = NDMA_ZC_DESC_WAIT_THRESHOLD_HI; + return true; } - - return ret; + + return false; } -int ndma_memcpy_zerocopy(struct neuron_device *nd, - u32 nc_id, - const nrt_tensor_batch_op_t *ops, - u32 num_ops, - dma_addr_t dev_base, - int qid, - bool direction) +static int ndma_zerocopy_pin_pages(int nd_id, + u32 nc_id, + struct ndma_ctx_queue *ctx_queue, + struct ndma_h2t_zcdma_context *dma_ctx, + bool use_remote_pin) { - int ret = 0; - const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); - struct ndma_eng *eng = &nd->ndma_engine[eng_id]; - struct ndma_queue *queue = &eng->queues[qid]; - struct ndma_ring *ring = &queue->ring_info; - struct ndma_h2t_zcdma_context dma_ctx_tbl[2] = {0}; - struct ndma_h2t_zcdma_context *pdma_ctx = NULL; - int next_dma_idx = 0; - int i = 0; - bool locked = false; - - // sanity check ring is owned by nc_id - if (!ndmar_h2t_ring_is_owner(ring, nc_id)) { - pr_err("nd%02d: attempting to use qid %d that was not assigned to nc %d\n", nd->device_index, qid, nc_id); - return -ENOENT; - } - - // initialize the static fields in the dma contexts that are the same for every operation - for (i=0;i< 2;i++) { - dma_ctx_tbl[i].eng = eng; - dma_ctx_tbl[i].ring = ring; - dma_ctx_tbl[i].direction = direction; - dma_ctx_tbl[i].page_list = kcalloc( NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); - dma_ctx_tbl[i].completion_ptr = kmalloc(DMA_COMPLETION_MARKER_SIZE * 2, GFP_KERNEL); - - if ((dma_ctx_tbl[i].page_list == NULL) || (dma_ctx_tbl[i].completion_ptr == NULL)) { - pr_err("could not allocate memory for dma contexts on nd %d\n", nd->device_index); - ret = -ENOMEM; - goto fail; - } - } - pdma_ctx = NULL; - - mutex_lock(&ring->h2t_ring_lock); - locked = true; - - // Process all operations with pipelining - for (i = 0; i < num_ops; i++) { - const nrt_tensor_batch_op_t *op = &ops[i]; - u64 remaining = op->size; - void *host_addr = op->buffer; - dma_addr_t dev_addr = dev_base + op->offset; - u64 offset = (unsigned long)host_addr & (PAGE_SIZE - 1); - u64 pin_size = ndma_calc_zc_pin_size(op->size + offset); // pin size is in page units, so include the page offset in size calc - - while (remaining) { - struct ndma_h2t_zcdma_context *dma_ctx = &dma_ctx_tbl[next_dma_idx]; - dma_ctx->start_time = get_jiffies_64(); - dma_ctx->host_addr = host_addr; - dma_ctx->dev_addr = dev_addr; - dma_ctx->size = pin_size - offset; // first chunk might not be aligned on the page boundary, all subsequent chunk will be aligned - // and the offset will be 0 - dma_ctx->last = (dma_ctx->size == remaining && i == num_ops - 1); - dma_ctx->nr_pages = DIV_ROUND_UP(pin_size, PAGE_SIZE); - if (dma_ctx->nr_pages > NDMA_ZC_PAGES_PER_XFER) { - pr_err_once("page count 
too large: %u\n", dma_ctx->nr_pages); - } - - //__GFP_SKIP_ZERO - int nr_pinned = pin_user_pages_fast((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, - direction ? 0 : FOLL_WRITE, dma_ctx->page_list); - if (nr_pinned != dma_ctx->nr_pages) { - // if failed pin_fast because of page fault, do the regular pinning - if (nr_pinned > 0) { - unpin_user_pages( dma_ctx->page_list, nr_pinned); - } + int nr_pinned = 0; + if (use_remote_pin) { + if (!dma_ctx->mm) { + pr_err("remote pin requested without mm context\n"); + return -EINVAL; + } #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 6))) - nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list); + nr_pinned = pin_user_pages_remote(dma_ctx->mm, + (unsigned long)dma_ctx->host_addr & PAGE_MASK, + dma_ctx->nr_pages, + dma_ctx->direction ? 0 : FOLL_WRITE, + dma_ctx->page_list, + NULL); #else - nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list, NULL); + nr_pinned = pin_user_pages_remote(dma_ctx->mm, + (unsigned long)dma_ctx->host_addr & PAGE_MASK, + dma_ctx->nr_pages, + dma_ctx->direction ? 0 : FOLL_WRITE, + dma_ctx->page_list, + NULL, + NULL); #endif - if (nr_pinned != dma_ctx->nr_pages) { - ret = -ENOMEM; // could use -EBUSY instead - pr_err("could not pin host pages for zero copy dma on nd %d: nr_pinned %d\n", nd->device_index, nr_pinned); - - if (nr_pinned > 0) { - unpin_user_pages( dma_ctx->page_list, nr_pinned); - } - // cleanup: wait for prev dma to complete (which also unpins pages) - if (pdma_ctx != NULL) { - ndma_zerocopy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); - } - goto fail; - } - } - - // TODO need to have this for other architectures - // for (i=0; i < dma_ctx->nr_pages; i++) { - // struct device - // dma_ctx->addr[i] = dma_map_page( nd->pdev->dev, dma_ctx_page_list[i], 0, PAGE_SIZE, DMA_TO_DEVICE/DMA_FROM_DEVICE); - // ret = dma_mapping_error(dev->dev, dma_ctx->addr[i]); - // if (ret) { } - // } - // flush_cache_range(vma, - - ret = ndma_build_n_issue_zc_descs(dma_ctx); - if (ret) { - unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); - // cleanup: wait for prev dma to complete (which also unpins pages) - if (pdma_ctx != NULL) { - ndma_zerocopy_wait_for_completion(nd, nc_id, eng, ring, pdma_ctx, NULL); - } - goto fail; - } - - if (pdma_ctx != NULL) { - ret = ndma_zerocopy_wait_for_completion(nd, nc_id, eng, ring, pdma_ctx, dma_ctx); - if (ret) { - goto fail; - } - } - - pdma_ctx = dma_ctx; - next_dma_idx = (next_dma_idx+1) % 2; - - remaining -= dma_ctx->size; - host_addr += dma_ctx->size; - dev_addr += dma_ctx->size; - pin_size = (remaining < pin_size) ? remaining : pin_size; - offset = 0; - } - } - - - // Wait for the last chunk - if (pdma_ctx) { - ret = ndma_zerocopy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); - } + mmput(dma_ctx->mm); + dma_ctx->mm = NULL; + } else { + nr_pinned = pin_user_pages_fast((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, + dma_ctx->direction ? 
0 : FOLL_WRITE, dma_ctx->page_list); -fail: - // release resources - for (i = 0; i < 2; i++) { - if (dma_ctx_tbl[i].page_list != NULL) - kfree(dma_ctx_tbl[i].page_list); - if (dma_ctx_tbl[i].completion_ptr != NULL) { - kfree(dma_ctx_tbl[i].completion_ptr); - } - } - if (locked) { - mutex_unlock(&ring->h2t_ring_lock); - } - - return ret; + if (nr_pinned != dma_ctx->nr_pages) { + // if failed pin_fast because of page fault, do the regular pinning + if (nr_pinned > 0) { + unpin_user_pages(dma_ctx->page_list, nr_pinned); + } + +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 6))) + nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, dma_ctx->direction ? 0 : FOLL_WRITE, dma_ctx->page_list); +#else + nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, dma_ctx->direction ? 0 : FOLL_WRITE, dma_ctx->page_list, NULL); +#endif + } + } + + if (nr_pinned != dma_ctx->nr_pages) { + int err = (nr_pinned < 0) ? nr_pinned : -ENOMEM; + + pr_err("could not pin host pages for zero copy dma on nd %d: nr_pinned %d\n", nd_id, nr_pinned); + + if (nr_pinned > 0) { + unpin_user_pages(dma_ctx->page_list, nr_pinned); + } + + return err; + } + + ctx_queue->nr_pinned_pages += dma_ctx->nr_pages; + dma_ctx->state = NDMA_PINNED_UNSUBMITTED; + + return 0; +} + +int ndma_zerocopy_submit(struct neuron_device *nd, + u32 nc_id, + const nrt_tensor_batch_op_t *ops, + u32 num_ops, + dma_addr_t dev_base, + int qid, + bool direction, + u64 sequence_num) +{ + int ret = 0; + int i = 0; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); + struct ndma_eng *eng = &nd->ndma_engine[eng_id]; + struct ndma_queue *queue = &eng->queues[qid]; + struct ndma_ring *ring = &queue->ring_info; + struct ndma_ctx_queue *ctx_queue = &ring->dma_ctx_queue; + struct ndma_h2t_zcdma_context *cur_ctx = NULL; + struct ndma_h2t_zcdma_op_context op_ctx; + bool async = (sequence_num != 0); + + /* Verify ring ownership. */ + if (!ndmar_h2t_ring_is_owner(ring, nc_id)) { + pr_err("nd%02d: attempting to use qid %d that was not assigned to nc %d\n", + nd->device_index, qid, nc_id); + return -ENOENT; + } + + mutex_lock(&ring->h2t_ring_lock); + + for (i = 0; i < num_ops; i++) { + const nrt_tensor_batch_op_t *op = &ops[i]; + op_ctx.host_addr = op->buffer; + op_ctx.dev_addr = dev_base + op->offset; + op_ctx.offset = (unsigned long)op_ctx.host_addr & (PAGE_SIZE - 1); + op_ctx.remaining = op->size; + /* pin_size is in page units; include the page offset. */ + op_ctx.pin_size = ndma_calc_zc_pin_size(op_ctx.remaining + op_ctx.offset); + + while (op_ctx.remaining) { + int nr_pages; + bool can_pin; + bool ctx_queue_full; + + /* Step 1: submit any pinned contexts that have available descriptors. */ + while (true) { + struct ndma_h2t_zcdma_context *unsubmitted_ctx = ndma_ctx_queue_peek_pinned_unsubmitted(ctx_queue); + + if (!unsubmitted_ctx || !_ndma_zc_descs_available(eng, ring->qid, unsubmitted_ctx->nr_pages)) { + break; + } + + ret = ndma_build_n_issue_zc_descs(unsubmitted_ctx); + if (ret) { + pr_err("failed to build and issue zero-copy descs\n"); + goto done; + } + + ndma_ctx_queue_inc_first_pinned_unsubmitted(ctx_queue); + } + + /* Step 2: set up the current ctx if there is room and pinned-page budget. 
*/ + nr_pages = DIV_ROUND_UP(op_ctx.pin_size, PAGE_SIZE); + can_pin = (ctx_queue->nr_pinned_pages + nr_pages <= NDMA_CTX_QUEUE_MAX_PINNED_PAGES); + ctx_queue_full = ndma_ctx_queue_is_full(ctx_queue); + + if (async && ctx_queue_full) { + pr_err("ctx queue full. failed to submit async ctx\n"); + ret = -EBUSY; + goto done; + } + + if ((can_pin || async) && !ctx_queue_full) { + cur_ctx = ndma_ctx_queue_peek_tail(ctx_queue); + cur_ctx->eng = eng; + cur_ctx->ring = ring; + cur_ctx->host_addr = op_ctx.host_addr; + cur_ctx->dev_addr = op_ctx.dev_addr; + // First chunk may be unaligned; later chunks are page-aligned with offset=0. + cur_ctx->size = op_ctx.pin_size - op_ctx.offset; + cur_ctx->direction = direction; + cur_ctx->last = (cur_ctx->size == op_ctx.remaining && i == num_ops - 1); + cur_ctx->nr_pages = nr_pages; + cur_ctx->state = NDMA_UNPINNED; + cur_ctx->nr_desc = 0; // Set by ndma_build_n_issue_zc_descs(). + cur_ctx->mm = NULL; + cur_ctx->sequence_num = sequence_num; + + /* Pin now if possible; otherwise capture mm for remote pinning (async only). */ + if (can_pin) { + ret = ndma_zerocopy_pin_pages(nd->device_index, nc_id, ctx_queue, cur_ctx, false); + if (ret) { + pr_err("failed to pin pages for zero copy dma on nd %d\n", nd->device_index); + goto done; + } + } else if (async) { + struct mm_struct *mm = current->mm; + mmget(mm); + cur_ctx->mm = mm; + } + + /* Advance the queue tail. + * May also initialize/advance the pinned+unsubmitted and unpinned pointers. + */ + ndma_ctx_queue_inc_tail(ctx_queue); + + /* Update loop variables for the next chunk. */ + op_ctx.remaining -= cur_ctx->size; + op_ctx.host_addr += cur_ctx->size; + op_ctx.dev_addr += cur_ctx->size; + op_ctx.pin_size = (op_ctx.remaining < op_ctx.pin_size) ? op_ctx.remaining : op_ctx.pin_size; + op_ctx.offset = 0; + cur_ctx = NULL; + } + + /* Step 3 (sync): wait for submitted transfers to complete from the head. */ + if (!async) { + u32 desc_threshold = NDMA_ZC_DESC_WAIT_THRESHOLD_LO; + + while (ndma_zc_should_wait(eng, ring, ctx_queue, &desc_threshold)) { + struct ndma_h2t_zcdma_context *submitted_ctx = ndma_ctx_queue_pop_submitted(ctx_queue); + + ret = ndma_memcpy_wait_for_completion(eng, ring, submitted_ctx->nr_desc + 1, + submitted_ctx->completion_ptr, + true, false); + ndma_zc_release_ctx(submitted_ctx, &ctx_queue->nr_pinned_pages); + if (ret) { + pr_err("failed to wait for completion of zero copy dma\n"); + goto done; + } + } + } + } + } + + if (!async) { + /* Step 4 (sync): submit remaining pinned ctxs, then drain all submitted ctxs. 
*/ + while (true) { + struct ndma_h2t_zcdma_context *ctx_to_submit = ndma_ctx_queue_peek_pinned_unsubmitted(ctx_queue); + + if (ctx_to_submit && _ndma_zc_descs_available(eng, ring->qid, ctx_to_submit->nr_pages)) { + ret = ndma_build_n_issue_zc_descs(ctx_to_submit); + if (ret) { + pr_err("failed to build and issue zero-copy descs\n"); + goto done; + } + ndma_ctx_queue_inc_first_pinned_unsubmitted(ctx_queue); + } + + struct ndma_h2t_zcdma_context *ctx_to_wait = ndma_ctx_queue_pop_submitted(ctx_queue); + if (ctx_to_wait) { + ret = ndma_memcpy_wait_for_completion(eng, ring, ctx_to_wait->nr_desc + 1, + ctx_to_wait->completion_ptr, + true, false); + ndma_zc_release_ctx(ctx_to_wait, &ctx_queue->nr_pinned_pages); + if (ret) { + pr_err("failed to wait for completion of zero copy dma\n"); + goto done; + } + } + + if (!ctx_to_submit && !ctx_to_wait) { + break; + } + } + } + +done: + if (ret) { + ndma_ctx_queue_drain(eng, ring, ctx_queue); + } + mutex_unlock(&ring->h2t_ring_lock); + return ret; +} + +/* The completion flow for completion, remote pinning, and submission. Async IO only */ +static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd, + struct ndma_eng *eng, + struct ndma_ring *ring, + bool *did_work) +{ + int ret = 0; + int err = 0; + struct ndma_ctx_queue *ctx_queue = NULL; + u32 desc_threshold = NDMA_ZC_DESC_WAIT_THRESHOLD_LO; + + if (!ring || !did_work) { + return -EINVAL; + } + *did_work = false; + + ctx_queue = &ring->dma_ctx_queue; + + mutex_lock(&ring->h2t_ring_lock); + + /* 1) Wait for at least one submitted context to complete */ + while (true) { + if (ndma_ctx_queue_submitted_empty(ctx_queue)) { + break; + } + if (*did_work && !ndma_zc_should_wait(eng, ring, ctx_queue, &desc_threshold)) { + break; + } + struct ndma_h2t_zcdma_context *submitted_ctx = ndma_ctx_queue_pop_submitted(ctx_queue); + + ret = ndma_memcpy_wait_for_completion(eng, ring, submitted_ctx->nr_desc + 1, submitted_ctx->completion_ptr, true, false); + if (ret) { + err = ret; + pr_err("async h2d dma completion failed for seq num %llu: %d\n", submitted_ctx->sequence_num, ret); + ndma_h2d_compl_queue_put(&ring->dma_compl_queue, submitted_ctx->sequence_num, ret, NULL); + ndma_ctx_queue_drain_sequence(ctx_queue, submitted_ctx->sequence_num); + } else if (submitted_ctx->last) { + ndma_h2d_compl_queue_put(&ring->dma_compl_queue, submitted_ctx->sequence_num, 0, NULL); + } + + ndma_zc_release_ctx(submitted_ctx, &ctx_queue->nr_pinned_pages); + + *did_work = true; + } + + /* 2) Submit pinned but unsubmitted contexts */ + while (true) { + struct ndma_h2t_zcdma_context *pinned_unsubmitted_ctx = ndma_ctx_queue_peek_pinned_unsubmitted(ctx_queue); + + if (!pinned_unsubmitted_ctx || !_ndma_zc_descs_available(eng, ring->qid, pinned_unsubmitted_ctx->nr_pages)) { + break; + } + + ret = ndma_build_n_issue_zc_descs(pinned_unsubmitted_ctx); + if (ret) { + err = ret; + pr_err("async h2d dma submission failed for seq num %llu: %d\n", pinned_unsubmitted_ctx->sequence_num, ret); + ndma_h2d_compl_queue_put(&ring->dma_compl_queue, pinned_unsubmitted_ctx->sequence_num, ret, NULL); + ndma_ctx_queue_drain_sequence(ctx_queue, pinned_unsubmitted_ctx->sequence_num); + } else { + ndma_ctx_queue_inc_first_pinned_unsubmitted(ctx_queue); + } + + *did_work = true; + } + + /* 3) Remote pin unpinned contexts */ + while (true) { + struct ndma_h2t_zcdma_context *unpinned_ctx = ndma_ctx_queue_peek_first_unpinned(ctx_queue); + + if (!unpinned_ctx || ctx_queue->nr_pinned_pages + unpinned_ctx->nr_pages > NDMA_CTX_QUEUE_MAX_PINNED_PAGES) { + break; 
+ } + + ret = ndma_zerocopy_pin_pages(nd->device_index, ring->h2t_nc_id, ctx_queue, unpinned_ctx, true); + if (ret) { + err = ret; + pr_err("async h2d dma remote pinning failed for seq num %llu: %d\n", unpinned_ctx->sequence_num, ret); + ndma_h2d_compl_queue_put(&ring->dma_compl_queue, unpinned_ctx->sequence_num, ret, NULL); + ndma_ctx_queue_drain_sequence(ctx_queue, unpinned_ctx->sequence_num); + } else { + ndma_ctx_queue_inc_first_unpinned(ctx_queue); + } + + *did_work = true; + } + + mutex_unlock(&ring->h2t_ring_lock); + return err; } diff --git a/neuron_dma.h b/neuron_dma.h index 012bef4..eeea14f 100644 --- a/neuron_dma.h +++ b/neuron_dma.h @@ -6,6 +6,8 @@ #ifndef NEURON_DMA_H #define NEURON_DMA_H +#include + #include "udma/udma.h" #include "neuron_mempool.h" @@ -200,7 +202,7 @@ dma_addr_t ndma_mc_to_pa(struct mem_chunk *mc); bool ndma_zerocopy_supported(void); /** - * ndma_memcpy_zerocopy - Perform a pipelined zero-copy DMA transfer. + * ndma_zerocopy_submit() - Perform a pipelined zero-copy DMA transfer. * @nd: Neuron device whose DMA engine is used. * @nc_id: Neuron core identifier owning the queue. * @ops: Array of host buffer descriptors. @@ -208,6 +210,7 @@ bool ndma_zerocopy_supported(void); * @dev_base: Base device physical address for the transfer. * @qid: Queue identifier to submit descriptors on. * @direction: true for host-to-device, false for device-to-host. + * @sequence_num: sequence number under async submission; 0 for sync. * * DMA data between a user space virtual address range and a contiguous location in device memory. * In order to do this, we need to know the physical pages are associated with @@ -222,26 +225,25 @@ bool ndma_zerocopy_supported(void); * We use pin_user_pages_fast() to reduce pinning overhead because we know the process can't go * away while we are down here doing our thing in the kernel within a single IOCTL call. * - * We ping pong back and forth between two dma contexts. So while dma for context A is in progress, - * we are pinning pages and starting dmas for context B. - * - * Algorithm goes like this: - * initial a pair of dma contexts - * prev dma ctx = null + * ## For sync mode ## + * We enqueue DMA contexts into a fixed-size queue and drive submission from that queue. + * The loop: * lock() - * while still more data remaining - * current dma ctx = next available context - * init current dma context - * calc size of the transfer for this dma context. We want to transfer up to page boundaries - * calc number of pages that need to be pinned for this dma - * pin host pages in memory - * generate descriptors for - * if prev dma ctx != NULL, wait for the prev dma to complete - * update host address, device address and ammount remaining - * wait for the last dma ctx to complete + * while data remains + * submit any pinned-but-unsubmitted ctxs when descriptors are available + * create a new ctx when queue space and pin budget allow + * pin pages immediately for the ctx + * advance queue tail and update host/device pointers + * wait on submitted ctxs from the head as needed to relieve pressure + * submit remaining pinned ctxs + * drain submitted ctxs from the head * unlock() - * free resources * + * ## For async mode ## + * We keep submitting dma contexts until we hit a threshold of pinned pages. + * Once we hit the threshold, we stop pinning pages and set the mm_struct for remote pinning later. + * TODO: liulily to add more detail once the complete async path is implemented. + * * Notes: * unpinning responsibilities. 
Up until a dma is successfully launched, this routine is responsible for unpinning * host memory. After that ndma_zerocopy_wait_for_completion() owns responsibility for unpinning pages. @@ -251,12 +253,13 @@ bool ndma_zerocopy_supported(void); * process context. * */ -int ndma_memcpy_zerocopy(struct neuron_device *nd, - u32 nc_id, - const nrt_tensor_batch_op_t *ops, - u32 num_ops, - dma_addr_t dev_base, - int qid, - bool direction); +int ndma_zerocopy_submit(struct neuron_device *nd, + u32 nc_id, + const nrt_tensor_batch_op_t *ops, + u32 num_ops, + dma_addr_t dev_base, + int qid, + bool direction, + u64 sequence_num); #endif diff --git a/neuron_dmabuf.c b/neuron_dmabuf.c index 6a510e1..2f53a07 100644 --- a/neuron_dmabuf.c +++ b/neuron_dmabuf.c @@ -28,7 +28,8 @@ #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 13, 0) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 13, 0)) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(10, 0))) MODULE_IMPORT_NS("DMA_BUF"); #else MODULE_IMPORT_NS(DMA_BUF); diff --git a/neuron_fw_io.c b/neuron_fw_io.c index 4222309..dbf9133 100644 --- a/neuron_fw_io.c +++ b/neuron_fw_io.c @@ -133,17 +133,71 @@ int fw_io_api_version_read(void * bar0, u32 *version) return ret; } -int fw_io_server_info_read(void *bar0, u32 *server_info) +int fw_io_server_info_read(void *bar0, int *server_id, int * rack_id) { int ret; + uint32_t server_info; void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_SERVER_RACK_ID_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, server_info, 1, true); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, &server_info, 1, true); if (ret) { pr_err("failed to get server info from the device, ret = %d\n", ret); + return -EIO; } - return ret; + if (server_id != NULL) { + *server_id = _REG_SERVERINFO_SVALID(server_info) ? _REG_SERVERINFO_SERVER(server_info) : -1; + } + if (rack_id != NULL) { + *rack_id = _REG_SERVERINFO_RVALID(server_info) ? 
_REG_SERVERINFO_RACK(server_info) : -1; + } + return 0; +} + +int fw_io_reservation_id_read(void *bar0, uint64_t *reservation_id) +{ + int ret; + uint32_t reservation_id_lo; + uint32_t reservation_id_hi; + void *addr; + + addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESERVATION_ID_LO; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, &reservation_id_lo, 1, true); + if (ret) { + pr_err("failed to get the lower 32 bits of the reservation id from the device\n"); + return -EIO; + } + addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESERVATION_ID_HI; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, &reservation_id_hi, 1, true); + if (ret) { + pr_err("failed to get the upper 32 bits of the reservation id from the device\n"); + return -EIO; + } + + *reservation_id = ((uint64_t)reservation_id_hi << 32) | reservation_id_lo; + return 0; +} + +int fw_io_instance_partition_sz_read(void *bar0, int *instance_sz, int *partition_sz) +{ + int ret; + uint32_t instance_partition_sz; + + void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_INSTANCE_PARTITION_SZ_OFFSET; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, &instance_partition_sz, 1, true); + if (ret) { + pr_err("failed to get instance/partition size info from the device, ret = %d\n", ret); + return -EIO; + } + + if (instance_sz != NULL) { + *instance_sz = _REG_INSTPARTSZ_INST(instance_partition_sz); + } + if (partition_sz != NULL) { + *partition_sz = _REG_INSTPARTSZ_VAL(instance_partition_sz) ? _REG_INSTPARTSZ_PART(instance_partition_sz) : -1; + } + + return 0; } int fw_io_device_id_read(void *bar0, u32 *device_id) @@ -229,12 +283,16 @@ static void dx_crc32c_add(const u8 *data, size_t len, u32 *csum) } } +// Note: fw_io_cmd_timeout_tbl is only used in fw_io_execute_request_new(). +// The timeouts only apply to cmd 3-5 as cmd 1-2 are still using the legacy framework. +// In the future, when cmd 1-2 are switched to the new framework, the timeout will likely need to +// be bumped to 10s as firmware side requests can take anywhere from 100ms to 7s to complete.
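+// The entries below appear to be expressed in microseconds (e.g. 1000 * 1000 * 90 == 90 seconds for FW_IO_CMD_SET_POWER_PROFILE).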
static const u32 fw_io_cmd_timeout_tbl[FW_IO_CMD_MAX] = { 0, // cmd 0 (1000 * 1000 * 1), // cmd 1 (FW_IO_CMD_READ) (1000 * 1000 * 1), // cmd 2 (FW_IO_CMD_POST_TO_CW) - (1000 * 1000 * 60), // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE) - (1000 * 1000 * 1), // cmd 4 (FW_IO_CMD_GET_DATA) + (1000 * 1000 * 90), // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE) + (1000 * 1000 * 10), // cmd 4 (FW_IO_CMD_GET_DATA) (1000 * 1000 * 60), // cmd 5 (FW_IO_CMD_SET_FEATURE) }; @@ -409,7 +467,7 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re } // Read response header - union fw_io_response_hdr resp_header; + union fw_io_response_hdr_new resp_header; reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET, &resp_header.reg.dw0); if (resp_header.hdr.sequence_number != ctx->next_seq_num) { @@ -423,9 +481,15 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re if (data_size > 0 && resp != NULL) { u32 copy_size = min(resp_size, data_size); u32 *resp_data = (u32*)resp; - for (j = 0; j < (copy_size + 3) / 4; j++) { + for (j = 0; j < copy_size / 4; j++) { reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET + j*4, &resp_data[j]); } + if (copy_size % 4) { + u32 remaining_resp = 0; + int idx = copy_size/4; + reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET + idx*4, &remaining_resp); + memcpy(&resp_data[idx], &remaining_resp, copy_size % 4); + } } ret = 0; break; @@ -433,8 +497,8 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re ctx->fw_io_err_count++; pr_err(KERN_ERR "seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, resp_header.hdr.error_code); + ret = -1; if (resp_header.hdr.error_code == FW_IO_UNKNOWN_COMMAND) { - ret = -1; break; } } @@ -551,22 +615,23 @@ int fw_io_read_csr_array_readless(void **ptrs, u32 *values, u32 num_csrs) return -1; } -void fw_io_initiate_reset(void __iomem *bar0, bool device_reset, u32 tpb_reset_map) +void fw_io_initiate_reset(void __iomem *bar0, bool device_reset, u32 tpb_reset_map_lo, u32 tpb_reset_map_hi) { - u32 reset_type; - void *address; - if (device_reset) { - reset_type = FW_IO_RESET_TYPE_DEVICE; - } else { - reset_type = FW_IO_RESET_TYPE_TPB; - address = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESET_TPB_MAP_OFFSET; - reg_write32((u32 *)address, tpb_reset_map); - mb(); - } - address = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESET_OFFSET; - reg_write32((u32 *)address, reset_type); - mb(); - fw_io_trigger(bar0); + void __iomem *misc_ram_addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset; + u32 reset_type; + + if (device_reset) { + reset_type = FW_IO_RESET_TYPE_DEVICE; + } else { + reset_type = FW_IO_RESET_TYPE_TPB; + reg_write32(misc_ram_addr + FW_IO_REG_RESET_TPB_MAP_LO_OFFSET, tpb_reset_map_lo); + reg_write32(misc_ram_addr + FW_IO_REG_RESET_TPB_MAP_HI_OFFSET, tpb_reset_map_hi); + mb(); + } + + reg_write32(misc_ram_addr + FW_IO_REG_RESET_OFFSET, reset_type); + mb(); + fw_io_trigger(bar0); } bool fw_io_is_reset_initiated(void __iomem *bar0) @@ -762,6 +827,27 @@ int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile) return fw_io_execute_request_new(ctx, FW_IO_CMD_SET_POWER_PROFILE, (u8 *)&data, sizeof(data), NULL, 0); } +int fw_io_get_performance_profile(struct fw_io_ctx *ctx, uint32_t *profile) +{ + struct fw_io_get_data_request req = {0}; + struct fw_io_get_perfprofile_response 
resp = {0}; + int ret; + if (!ctx || !profile) { + return -EINVAL; + } + + req.type = 1; + + ret = fw_io_execute_request_new(ctx, FW_IO_CMD_GET_DATA, (u8 *)&req, sizeof(req), (u8 *)&resp, sizeof(resp)); + if (ret == 0) { + *profile = (uint32_t)resp.profile; + } else { + pr_err("failed to get profile, ret = %d\n", ret); + } + + return ret; +} + int fw_io_enable_throttling_notifications(struct fw_io_ctx *ctx, bool enable) { /* * Note: @@ -784,4 +870,26 @@ int fw_io_enable_throttling_notifications(struct fw_io_ctx *ctx, bool enable) } return fw_io_execute_request_new(ctx, FW_IO_CMD_SET_FEATURE, &features, sizeof(features), NULL, 0); -} \ No newline at end of file +} + +int fw_io_get_available_profiles(struct fw_io_ctx *ctx, u16 feature, u8 *num_profiles, u8 bitmap[32]) +{ + struct fw_io_get_available_profiles_request req; + struct fw_io_get_available_profiles_response response; + int ret; + if (!ctx) { + return -EINVAL; + } + + req.type = 2; + req.operation = feature; + + ret = fw_io_execute_request_new(ctx, FW_IO_CMD_GET_DATA, (u8*)&req, sizeof(req), (u8*)&response, sizeof(response)); + if (ret) { + return ret; + } + + *num_profiles = response.num_profiles; + memcpy(bitmap, response.profiles_bitmap, sizeof(response.profiles_bitmap)); + return 0; +} diff --git a/neuron_fw_io.h b/neuron_fw_io.h index 3faf65c..83a5709 100644 --- a/neuron_fw_io.h +++ b/neuron_fw_io.h @@ -20,14 +20,33 @@ union fw_io_request_hdr { } reg; }; +// Note: Firmware updated to include crc32 field in response header, but +// to maintain backward compatibility, keeping original response header +// struct and adding crc32 field to new header. + +// Response header for legacy protocol +// Used by fw_io_execute_request() for legacy commands union fw_io_response_hdr { struct { u8 sequence_number; // request sequence number u8 error_code; // 0 means request was successfully completed u16 size; // response size in bytes including this header } hdr; + u32 dw0; // bytes 0-3: sequence_number, error_code, size +}; + +// Response header for new protocol +// Used by fw_io_execute_request_new() for new commands +union fw_io_response_hdr_new { + struct { + u8 sequence_number; // request sequence number + u8 error_code; // 0 means request was successfully completed + u16 size; // response size in bytes including this header + u32 crc32; + } hdr; struct { - u32 dw0; + u32 dw0; // bytes 0-3: sequence_number, error_code, size + u32 dw1; // bytes 4-7: crc32 } reg; }; @@ -41,6 +60,11 @@ struct fw_io_response { u8 data[]; }; +struct fw_io_response_new { + union fw_io_response_hdr_new response_hdr; + u8 data[]; +}; + union fw_io_req_perfprofile_data { struct { uint32_t reserved; @@ -53,8 +77,30 @@ union fw_io_req_perfprofile_data { }; struct fw_io_get_data_request { - uint8_t type; // fw_io_data_request_type - uint8_t reserved[3]; // reserved for future use/alignment + uint8_t type; +}; + +struct fw_io_get_perfprofile_response { + uint8_t reserved[4]; + uint8_t profile; + uint8_t voltage_margin; + uint8_t frequency; + uint8_t ocw; +}; + +struct fw_io_get_available_profiles_request { + uint8_t type; // must be 2 + uint16_t operation; +} __packed; + +struct fw_io_get_available_profiles_response { + uint8_t num_profiles; + uint8_t profiles_bitmap[32]; +}; + +enum fw_io_get_available_profiles_feature { + FW_IO_AVAILABLE_PERF_PROFILES_ALL = 0, + FW_IO_AVAILABLE_PERF_PROFILES_HBM_7200 = 5 }; // Feature bitmap for FW_IO_CMD_SET_FEATURE @@ -86,7 +132,7 @@ enum { // Bitmap of PIR reset types to be written to FW_IO_REG_RESET_OFFSET enum { 
FW_IO_RESET_TYPE_DEVICE = 1, - FW_IO_RESET_TYPE_TPB = 2 // Requires FW_IO_REG_RESET_TPB_MAP_OFFSET to be populated with a tpb map prior to use + FW_IO_RESET_TYPE_TPB = 2 // Requires FW_IO_REG_RESET_TPB_MAP_LO_OFFSET to be populated with a tpb map prior to use }; // offsets in MISC RAM for FWIO @@ -99,6 +145,11 @@ enum { // - The value of this register is used to determine the offset of other registers. FW_IO_REG_API_VERSION_OFFSET = 0x00, + + // MISC RAM instance/partition size info + // (0:5) instance size, 16:30 partition size, 31 partition size valid + FW_IO_REG_INSTANCE_PARTITION_SZ_OFFSET = 0x30, + // MISC RAM slots for serial number for V2 // - The lower 32 bits and the upper 32 bits together represent the 64-bit serial number. FW_IO_REG_SERIAL_NUMBER_LO_OFFSET = 0x38, // 14 * 4 bytes @@ -125,6 +176,10 @@ enum { FW_IO_REG_POWER_UTIL_D1_OFFSET = 0x58, // 22 * 4 bytes FW_IO_REG_HBM_REPAIR_STATE_OFFSET = 0x64, // 25 * 4 bytes + // + + FW_IO_REG_RESERVATION_ID_HI = 0x80, // 32 * 4 bytes + FW_IO_REG_RESERVATION_ID_LO = 0x84, // 33 * 4 bytes FW_IO_REG_RUNTIME_RESERVED0 = 0xC0, // 0xC0 to 0xF0 @@ -138,7 +193,8 @@ enum { FW_IO_REG_POD_SERNUM_LO = 0x198, FW_IO_REG_RUNTIME_RESERVED1 = 0x1a0, // 0x1a0 to 1d0 - FW_IO_REG_RESET_TPB_MAP_OFFSET = 0x1d8, + FW_IO_REG_RESET_TPB_MAP_HI_OFFSET = 0x1d4, + FW_IO_REG_RESET_TPB_MAP_LO_OFFSET = 0x1d8, FW_IO_REG_RESET_OFFSET = 0x1ec, FW_IO_REG_REQUEST_BASE_ADDR_LOW_OFFSET = 0x1f4, FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET = 0x1f0, @@ -147,7 +203,47 @@ enum { FW_IO_REG_TRIGGER_INT_NOSEC_OFFSET = 0x800, FW_IO_REG_ACK_OFFSET = 0xf0, }; - + +// Instance/partition register field decode +// +#define _REG_INSTPARTSZ_INSTBITS 6 +#define _REG_INSTPARTSZ_INSTSHIFT 0 +#define _REG_INSTPARTSZ_INSTMASK ((1 << _REG_INSTPARTSZ_INSTBITS)-1) +#define _REG_INSTPARTSZ_INST(inst) (((inst) >> _REG_INSTPARTSZ_INSTSHIFT) & _REG_INSTPARTSZ_INSTMASK) + +#define _REG_INSTPARTSZ_PARTBITS 15 +#define _REG_INSTPARTSZ_PARTSHIFT 16 +#define _REG_INSTPARTSZ_PARTMASK ((1 << _REG_INSTPARTSZ_PARTBITS)-1) +#define _REG_INSTPARTSZ_PART(part) (((part) >> _REG_INSTPARTSZ_PARTSHIFT) & _REG_INSTPARTSZ_PARTMASK) + +#define _REG_INSTPARTSZ_VALBITS 1 +#define _REG_INSTPARTSZ_VALSHIFT 31 +#define _REG_INSTPARTSZ_VALMASK ((1 << _REG_INSTPARTSZ_VALBITS)-1) +#define _REG_INSTPARTSZ_VAL(val) (((val) >> _REG_INSTPARTSZ_VALSHIFT) & _REG_INSTPARTSZ_VALMASK) + +// server info register field decode +// +#define _REG_SERVERINFO_SERVERBITS 15 +#define _REG_SERVERINFO_SERVERSHIFT 0 +#define _REG_SERVERINFO_SERVERMASK ((1 << _REG_SERVERINFO_SERVERBITS)-1) +#define _REG_SERVERINFO_SERVER(serv) (((serv) >> _REG_SERVERINFO_SERVERSHIFT) & _REG_SERVERINFO_SERVERMASK) + +#define _REG_SERVERINFO_SVALIDBITS 1 +#define _REG_SERVERINFO_SVALIDSHIFT 15 +#define _REG_SERVERINFO_SVALIDMASK ((1 << _REG_SERVERINFO_SVALIDBITS)-1) +#define _REG_SERVERINFO_SVALID(sval) (((sval) >> _REG_SERVERINFO_SVALIDSHIFT) & _REG_SERVERINFO_SVALIDMASK) + +#define _REG_SERVERINFO_RACKBITS 15 +#define _REG_SERVERINFO_RACKSHIFT 16 +#define _REG_SERVERINFO_RACKMASK ((1 << _REG_SERVERINFO_RACKBITS)-1) +#define _REG_SERVERINFO_RACK(rack) (((rack) >> _REG_SERVERINFO_RACKSHIFT) & _REG_SERVERINFO_RACKMASK) + +#define _REG_SERVERINFO_RVALIDBITS 1 +#define _REG_SERVERINFO_RVALIDSHIFT 31 +#define _REG_SERVERINFO_RVALIDMASK ((1 << _REG_SERVERINFO_RVALIDBITS)-1) +#define _REG_SERVERINFO_RVALID(rval) (((rval) >> _REG_SERVERINFO_RVALIDSHIFT) & _REG_SERVERINFO_RVALIDMASK) + +// #define FW_IO_REG_METRIC_BUF_SZ 128 struct fw_io_ctx { @@ -287,11 +383,15 @@ int 
fw_io_post_metric_new(struct fw_io_ctx *ctx, u8 *data, u32 size); * * @bar0: Device's BAR0 base address * @device_reset: True if we are doing a device-level reset - * @tpb_reset_map: If device_reset is false (tpb reset), bitmap of blocks to reset - * [1:0] NC mask - * [13:8] TopSp mask + * @tpb_reset_map_lo: If device_reset is false (tpb reset), bitmap of blocks to reset (bits 0-31) + * [7:0] TPB mask + * [15:8] SDMA group mask + * [23:16] TOP_SP mask + * [31:24] CC_TOP mask + * @tpb_reset_map_hi: + * [3:0] Top-Level DMA group mask */ -void fw_io_initiate_reset(void __iomem *bar0, bool device_reset, u32 tpb_reset_map); +void fw_io_initiate_reset(void __iomem *bar0, bool device_reset, u32 tpb_reset_map_lo, u32 tpb_reset_map_hi); /** * fw_io_is_reset_initiated() - Check if local reset is initiated or not. @@ -319,10 +419,29 @@ int fw_io_read_counters(struct fw_io_ctx *ctx, uint64_t addr_in[], uint32_t val_ /** * fw_io_server_info_read() - Read server info * @param bar - from bar - * @param server_info - server info containing rack & server ids + * @param server_id - server id or -1 if invalid + * @param rack_id - rack id or -1 if invalid + * @return 0 on success. + */ +int fw_io_server_info_read(void *bar0, int *server_id, int * rack_id); + + +/** + * fw_io_reservation_id_read() - Read reservation id + * @param bar - from bar + * @param reservation_id - server reservation id + * @return 0 on success. + */ +int fw_io_reservation_id_read(void *bar0, uint64_t *reservation_id); + +/** + * fw_io_instance_partition_sz_read() - instance/partition sizes + * @param bar - from bar + * @param instance_sz - instance size. -1 if invalid + * @param partition_sz - partition size. -1 if invalid * @return 0 on success. */ -int fw_io_server_info_read(void *bar0, u32 *server_info); +int fw_io_instance_partition_sz_read(void *bar0, int *instance_sz, int *partition_sz); /** * fw_io_device_id_read() - Read device id @@ -433,6 +552,14 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re */ int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile); +/** + * fw_io_get_performance_profile() - Get current performance profile + * @param ctx: FWIO context + * @param profile: Pointer to store the current profile value + * @return 0 on success, negative on failure + */ +int fw_io_get_performance_profile(struct fw_io_ctx *ctx, uint32_t *profile); + /** * fw_io_enable_throttling_notifications() - Enable throttling notifications * @param ctx: FWIO context @@ -440,4 +567,14 @@ int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile); * @return 0 on success, negative on failure */ int fw_io_enable_throttling_notifications(struct fw_io_ctx *ctx, bool enable); + +/** + * fw_io_get_available_profiles() - Get available profiles + * @param ctx: FWIO context + * @param feature: Profiles with a particular feature (0 for all profiles supported by instance) + * @param num_profiles: Number of valid profiles in response + * @param bitmap: Bitmap of supported profiles in response + */ +int fw_io_get_available_profiles(struct fw_io_ctx *ctx, u16 feature, u8 *num_profiles, u8 bitmap[32]); + #endif diff --git a/neuron_ioctl.h b/neuron_ioctl.h index bcc9c6e..b20170b 100644 --- a/neuron_ioctl.h +++ b/neuron_ioctl.h @@ -637,13 +637,27 @@ struct neuron_ioctl_get_va_placement { __s32 device_index; // [out] Neuron device index (negative if VA does not represent Neuron memory) __s32 hbm_index; // [out] HBM index }; +struct neuron_ioctl_available_perf_profiles { + __u16 requested_feature; // [in] 
- 0 means all available profiles (no feature filter) + __u8 num_profiles; // [out] + __u8 bitmap[32]; // [out] firmware limited to 256 profiles +}; + +struct neuron_ioctl_get_async_h2t_dma_compl_queues { + __u32 nc_id; /* [in] neuron core id */ + __u32 qid_bitmap; /* [in] bitmap of dma queues requested */ + struct { + __u64 mmap_offset; /* [out] mmap offset of each completion queue */ + __u32 mmap_size; /* [out] mmap size of queue + metadata */ + } compl_queue_info[16]; +}; + #define NEURON_IOCTL_BASE 'N' /* Deprecated reset related IOCTLs. Now it would always return success. */ #define NEURON_IOCTL_DEVICE_RESET _IO(NEURON_IOCTL_BASE, 1) #define NEURON_IOCTL_DEVICE_READY _IOR(NEURON_IOCTL_BASE, 2, __u8) -#define NEURON_IOCTL_DEVICE_RESET_STATUS _IOR(NEURON_IOCTL_BASE, 106, __u8) /** Returns devices information and connection topology. */ #define NEURON_IOCTL_DEVICE_INFO _IOR(NEURON_IOCTL_BASE, 3, struct neuron_ioctl_device_info *) @@ -665,9 +679,6 @@ struct neuron_ioctl_get_va_placement { /** Allocated memory and return a memory_handle. */ #define NEURON_IOCTL_MEM_ALLOC _IOR(NEURON_IOCTL_BASE, 21, struct neuron_ioctl_mem_alloc *) -#define NEURON_IOCTL_MEM_ALLOC_V2 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2 *) // V2 here refers to neuron 2.x, not arch type -#define NEURON_IOCTL_MEM_ALLOC_V2MT _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type) // just V2 with additional field mem_type -#define NEURON_IOCTL_MEM_ALLOC_V2MT64 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type64) // V2 + mem_type + pad /** Free given memory_handle. */ #define NEURON_IOCTL_MEM_FREE _IOR(NEURON_IOCTL_BASE, 22, struct neuron_ioctl_mem_free *) @@ -704,8 +715,6 @@ struct neuron_ioctl_get_va_placement { /** Initializes given DMA queue */ #define NEURON_IOCTL_DMA_QUEUE_INIT _IOR(NEURON_IOCTL_BASE, 33, struct neuron_ioctl_dma_queue_init *) -#define NEURON_IOCTL_DMA_QUEUE_INIT_BATCH _IOR(NEURON_IOCTL_BASE, 133, struct neuron_ioctl_dma_queue_init_batch) - /** Releases given DMA queue */ #define NEURON_IOCTL_DMA_QUEUE_RELEASE _IOR(NEURON_IOCTL_BASE, 34, struct neuron_ioctl_dma_queue_release *) /** Starts DMA transfer of given number of descriptors */ @@ -772,6 +781,11 @@ struct neuron_ioctl_get_va_placement { /** Returns pci device information - only for devices opened by the calling proceess (deprecated, don't use) */ #define NEURON_IOCTL_DEVICE_BDF _IOR(NEURON_IOCTL_BASE, 101, struct neuron_ioctl_device_bdf *) +/** Allocated memory and return a memory_handle. */ +#define NEURON_IOCTL_MEM_ALLOC_V2 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2 *) // V2 here refers to neuron 2.x, not arch type +#define NEURON_IOCTL_MEM_ALLOC_V2MT _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type) // just V2 with additional field mem_type +#define NEURON_IOCTL_MEM_ALLOC_V2MT64 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type64) // V2 + mem_type + pad + /** Resets the requested NC (-1 for full device) */ #define NEURON_IOCTL_NC_RESET _IOR(NEURON_IOCTL_BASE, 103, struct neuron_ioctl_device_reset *) @@ -782,6 +796,9 @@ struct neuron_ioctl_get_va_placement { #define NEURON_IOCTL_PROGRAM_ENGINE_NC _IOWR(NEURON_IOCTL_BASE, 105, struct neuron_ioctl_program_engine_nc *) #define NEURON_IOCTL_PROGRAM_ENGINE_NC64 _IOWR(NEURON_IOCTL_BASE, 105, struct neuron_ioctl_program_engine_nc64) +/* Deprecated reset related IOCTLs. Now it would always return success. 
*/ +#define NEURON_IOCTL_DEVICE_RESET_STATUS _IOR(NEURON_IOCTL_BASE, 106, __u8) + /** Returns pci device information for any Neuron devices (not just these opened by the calling process */ #define NEURON_IOCTL_DEVICE_BDF_EXT _IOR(NEURON_IOCTL_BASE, 106, struct neuron_ioctl_device_bdf_ext *) @@ -847,7 +864,13 @@ struct neuron_ioctl_get_va_placement { #define NEURON_IOCTL_GET_VA_PLACEMENT _IOW(NEURON_IOCTL_BASE, 131, struct neuron_ioctl_get_va_placement) -// Note: 133 is taken by NEURON_IOCTL_DMA_QUEUE_INIT_BATCH -#define NEURON_IOCTL_MAX 132 +#define NEURON_IOCTL_GET_PERFORMANCE_PROFILE _IOWR(NEURON_IOCTL_BASE, 132, struct neuron_ioctl_power_profile) + +/** Batch DMA initialization given DMA queue */ +#define NEURON_IOCTL_DMA_QUEUE_INIT_BATCH _IOR(NEURON_IOCTL_BASE, 133, struct neuron_ioctl_dma_queue_init_batch) + +#define NEURON_IOCTL_AVAILABLE_PERF_PROFILES _IOWR(NEURON_IOCTL_BASE, 134, struct neuron_ioctl_available_perf_profiles) + +#define NEURON_IOCTL_GET_ASYNC_H2T_DMA_COMPL_QUEUES _IOWR(NEURON_IOCTL_BASE, 135, struct neuron_ioctl_get_async_h2t_dma_compl_queues) #endif diff --git a/neuron_mempool.c b/neuron_mempool.c index 20e4436..b5b2d2f 100644 --- a/neuron_mempool.c +++ b/neuron_mempool.c @@ -133,7 +133,7 @@ static void mc_remove_node(struct rb_root *root, struct mem_chunk *mc) * Return: 0 if pool is created, a negative error code otherwise. */ -static int mp_init_device_mem(struct mempool *mp, struct mempool_set *mpset, +static int mp_init_device_mem(struct neuron_mempool *mp, struct neuron_mempool_set *mpset, u64 start_addr, size_t pool_size, u32 dram_channel, u32 dram_region) { int ret; @@ -205,7 +205,7 @@ static int mp_init_device_mem(struct mempool *mp, struct mempool_set *mpset, * Frees all backing pages allocated for reserved host_mem pool. * Does opposite work of mp_init_hrm_pool */ -static void mp_destroy_hrm_pool(struct mempool *mp) +static void mp_destroy_hrm_pool(struct neuron_mempool *mp) { int i = 0; if (mp->page_va_array == NULL) @@ -232,13 +232,13 @@ static void mp_destroy_hrm_pool(struct mempool *mp) * Any page allocation failure is ignored. * * @mp: pointer to mempool that needs to be initialized - * @mpset: pointer to parent mempool_set + * @mpset: pointer to parent neuron_mempool_set * @page_size: backing host memory's page size * @page_count: Max number of pages to allocate * * Return: 0 if pool is created, a negative error code otherwise. */ -static int mp_init_hrm_pool(struct mempool *mp, struct mempool_set *mpset, +static int mp_init_hrm_pool(struct neuron_mempool *mp, struct neuron_mempool_set *mpset, u32 page_size, u32 page_count) { int ret; @@ -299,7 +299,7 @@ static int mp_init_hrm_pool(struct mempool *mp, struct mempool_set *mpset, /** * Frees all the chunks associated with the mempool and releases the mempool. 
*/ -static void mp_destroy_gen_pool(struct mempool *mp) +static void mp_destroy_gen_pool(struct neuron_mempool *mp) { BUG_ON(mp == NULL); if (!mp->initialized) @@ -315,6 +315,58 @@ static void mp_destroy_gen_pool(struct mempool *mp) } } +// Upper 16MB is used internally by the firmware, don't use it in the allocation pool +#define MEMPOOL_CARVEOUT_SIZE 0x1000000 // 16MB +/** + * mpset_block_carveout_regions() + * + * @param nd: neuron device + * @param mpset: pointer to mpset + * @param device_dram_addr: DRAM Channel addresses + * @param device_dram_size: DRAM Channel sizes + * @return int: 0 on success, o/w on failure + */ +static int mpset_block_carveout_regions(struct neuron_device *nd, struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +{ + int ret; + u64 region_sz; + int channel = 0, region = 0; + + /* + * Block carve out regions: Upper 16 MB is used internally by firmware + * + * Ideally we would carve out by simply changing the start address of the chunk; + * however, that breaks aligned allocation in 4.x kernel versions (fixed in 5.x). + * Fix here: + * commit 52fbf1134d479234d7e64ba9dcbaea23405f229e + * Author: Alexey Skidanov + * Date: Thu Jan 3 15:26:44 2019 -0800 + * + * lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk + */ + for (channel = 0; channel < mpset->num_channels; channel++) { + region_sz = device_dram_size[channel] / mpset->mp_device_num_regions; + for (region = 0; region < mpset->mp_device_num_regions; region++) { + const dma_addr_t start_addr = device_dram_addr[channel] + (region * region_sz); + struct mem_chunk *mc = NULL; + u32 nc_id = channel; + ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, MEMPOOL_CARVEOUT_SIZE, 0, MEM_LOC_DEVICE, channel, region, nc_id, NEURON_MEMALLOC_TYPE_NCDEV_DEVICE, &mc); + if (ret) { + pr_err("failed to allocate hbm carveout region: ret=%d\n", ret); + return -ENOMEM; + } + if (mc->pa != start_addr) { + pr_err("carve out mc not offset 0!"); + mc_free(&mc); + return -EINVAL; + } + } + ndhal->ndhal_mpset.device_dram_effective_base_addr[channel] = device_dram_addr[channel] + MEMPOOL_CARVEOUT_SIZE; + } + + return 0; +} + /** * mpset_init_device_pools() - Prepare device mp in given mpset. * @@ -323,7 +375,7 @@ static void mp_destroy_gen_pool(struct mempool *mp) * * Return: 0 if initialization succeeds, a negative error code otherwise. */ -static int mpset_init_device_pools(struct mempool_set *mpset, struct neuron_device *nd) +static int mpset_init_device_pools(struct neuron_mempool_set *mpset, struct neuron_device *nd) { int ret; int channel = 0, region = 0; @@ -345,7 +397,7 @@ static int mpset_init_device_pools(struct mempool_set *mpset, struct neuron_devi } } - ret = ndhal->ndhal_mpset.mpset_block_carveout_regions(nd, mpset, device_dram_addr, device_dram_size); + ret = mpset_block_carveout_regions(nd, mpset, device_dram_addr, device_dram_size); if (ret) { goto fail; } @@ -358,7 +410,7 @@ static int mpset_init_device_pools(struct mempool_set *mpset, struct neuron_devi mp_destroy_gen_pool(&mpset->mp_device[channel][region]); } } - memset(mpset, 0, sizeof(struct mempool_set)); + memset(mpset, 0, sizeof(struct neuron_mempool_set)); return ret; } @@ -383,7 +435,7 @@ static int mpset_print_lifespan_list(const char *name, struct list_head *head) /** Verifies all MC allocated from the mpset is freed. 
*/ -static void mpset_verify_all_mc_freed(struct mempool_set *mpset) +static void mpset_verify_all_mc_freed(struct neuron_mempool_set *mpset) { int i, count; count = mpset_print_lifespan_list("LOCAL", &mpset->mc_lifespan_local_head); @@ -396,7 +448,7 @@ static void mpset_verify_all_mc_freed(struct mempool_set *mpset) BUG_ON(count != 0); } -int mpset_constructor(struct mempool_set *mpset, void *pdev, struct neuron_device *nd) +int mpset_constructor(struct neuron_mempool_set *mpset, void *pdev, struct neuron_device *nd) { int host_page_index; u64 host_allocated_size = 0; @@ -442,9 +494,9 @@ int mpset_constructor(struct mempool_set *mpset, void *pdev, struct neuron_devic } static void mpset_free_lifespan_list(struct list_head *head, struct list_head *new_head); -static struct list_head * mpset_get_lifespan_head(struct mempool_set *mpset, enum mc_lifespan lifespan); +static struct list_head * mpset_get_lifespan_head(struct neuron_mempool_set *mpset, enum mc_lifespan lifespan); -void mpset_destructor(struct mempool_set *mpset) +void mpset_destructor(struct neuron_mempool_set *mpset) { int i, channel, region; struct list_head *head; @@ -477,7 +529,7 @@ void mpset_destructor(struct mempool_set *mpset) mutex_unlock(&mpset->lock); } -struct mem_chunk *mpset_search_mc(struct mempool_set *mp, phys_addr_t pa) +struct mem_chunk *mpset_search_mc(struct neuron_mempool_set *mp, phys_addr_t pa) { struct rb_node *node = mp->root.rb_node; /* top of the tree */ @@ -495,7 +547,7 @@ struct mem_chunk *mpset_search_mc(struct mempool_set *mp, phys_addr_t pa) return NULL; } -static inline struct list_head * mpset_get_lifespan_head(struct mempool_set *mpset, enum mc_lifespan lifespan) +static inline struct list_head * mpset_get_lifespan_head(struct neuron_mempool_set *mpset, enum mc_lifespan lifespan) { struct list_head *head = NULL; if (lifespan == MC_LIFESPAN_LOCAL) { @@ -514,7 +566,7 @@ static inline struct list_head * mpset_get_lifespan_head(struct mempool_set *mps static void mc_add_to_lifespan_list(struct mem_chunk *mc) { - struct mempool_set *mpset = mc->mpset; + struct neuron_mempool_set *mpset = mc->mpset; struct list_head *head; head = mpset_get_lifespan_head(mpset, mc->lifespan); list_add(&mc->lifespan_list, head); @@ -554,7 +606,7 @@ static void mpset_free_lifespan_list(struct list_head *head, struct list_head *n } } -void mpset_free_expired_mc(struct mempool_set *mpset, enum mc_lifespan lifespan) +void mpset_free_expired_mc(struct neuron_mempool_set *mpset, enum mc_lifespan lifespan) { struct list_head *head, *next_head; head = mpset_get_lifespan_head(mpset, lifespan); @@ -562,7 +614,7 @@ void mpset_free_expired_mc(struct mempool_set *mpset, enum mc_lifespan lifespan) mpset_free_lifespan_list(head, next_head); } -static inline u64 get_offset_for_scratchpad_alloc(const struct mempool *mp, u64 alloc_size) +static inline u64 get_offset_for_scratchpad_alloc(const struct neuron_mempool *mp, u64 alloc_size) { /* Contiguous scratchpad grows backwards from the end of the main genpool @@ -581,8 +633,8 @@ static int mc_alloc_internal(struct neuron_device *nd, enum mc_lifespan lifespan struct mem_chunk **result) { struct mem_chunk *mc; - struct mempool *mp = NULL; - struct mempool_set *mpset = &nd->mpset; + struct neuron_mempool *mp = NULL; + struct neuron_mempool_set *mpset = &nd->mpset; struct gen_pool *pool = NULL; struct gen_pool *alt_pool = NULL; int ret = 0; @@ -799,7 +851,7 @@ int mc_alloc_align(struct neuron_device *nd, enum mc_lifespan lifespan, u64 size void mc_inc_refcount(struct mem_chunk *mc) { - 
struct mempool_set *mpset = mc->mpset; + struct neuron_mempool_set *mpset = mc->mpset; mutex_lock(&mpset->lock); mc->ref_count++; mutex_unlock(&mpset->lock); @@ -807,7 +859,7 @@ void mc_inc_refcount(struct mem_chunk *mc) void mc_free(struct mem_chunk **mcp) { - struct mempool_set *mpset; + struct neuron_mempool_set *mpset; struct mem_chunk *mc = *mcp; BUG_ON(mc == NULL); @@ -845,7 +897,7 @@ void mc_free(struct mem_chunk **mcp) mpset->host_mem_size -= mc->size; nsysfsmetric_dec_counter(mpset->nd, NON_NDS_METRIC, NON_NDS_COUNTER_HOST_MEM, mc->nc_id, mc->size, false); } else if (mc->mem_location == MEM_LOC_DEVICE) { - struct mempool *mp; + struct neuron_mempool *mp; mp = &mpset->mp_device[mc->dram_channel][mc->dram_region]; gen_pool_free(mc->gen_pool, (u64)mc->va, mc->size); mp->allocated_size -= mc->size; @@ -878,7 +930,7 @@ void mc_free(struct mem_chunk **mcp) int mc_dump_all_chunks(struct neuron_device *nd, u32 channel, u32 num_entries_in, struct neuron_ioctl_mem_chunk_info *data, u32 *num_entries_out) { - struct mempool_set *mpset = &nd->mpset; + struct neuron_mempool_set *mpset = &nd->mpset; u32 cnt = 0; struct rb_node *node; diff --git a/neuron_mempool.h b/neuron_mempool.h index 2fcdf10..61cdd5f 100644 --- a/neuron_mempool.h +++ b/neuron_mempool.h @@ -9,7 +9,7 @@ * 2. mempool/mp - Is a pool of memory backed either device DRAM or host DRAM. * For device memory it uses gen_pool allocator to allocate memory. * For host memory it directly uses kmalloc(). - * 3. mempool_set/mpset - Is collection for mp for given neuron device. + * 3. neuron_mempool_set/mpset - Is collection for mp for given neuron device. */ #ifndef NEURON_MEMPOOL_H @@ -39,11 +39,11 @@ enum mem_location { * Device is memory is split in to chunks and allocated. * Uses genpool allocator in the backend. */ -struct mempool { +struct neuron_mempool { char name[32]; // friendly name bool initialized; // True if initialized. - struct mempool_set *mpset; // parent mpset + struct neuron_mempool_set *mpset; // parent mpset enum mem_location mem_location; // location of the memory u32 dram_channel; // DRAM channel valid only if location is device @@ -75,16 +75,16 @@ struct mempool { // Number for MPs for host allocation #define MP_HOST_RESERVE_MEMORY_POOL_COUNT 4 -struct mempool_set { +struct neuron_mempool_set { struct mutex lock; struct neuron_device *nd; // backponter to neuron_device u32 mp_device_num_regions; // number of regions in the device pool u32 num_channels; // number of regions in the device pool - struct mempool mp_device[MAX_DRAM_CHANNELS][MAX_DDR_REGIONS]; // device memory pools + struct neuron_mempool mp_device[MAX_DRAM_CHANNELS][MAX_DDR_REGIONS]; // device memory pools - struct mempool mp_hrm[MP_HOST_RESERVE_MEMORY_POOL_COUNT]; // host reserve memory pools + struct neuron_mempool mp_hrm[MP_HOST_RESERVE_MEMORY_POOL_COUNT]; // host reserve memory pools // linked list head to store mem_chunk of different lifespan struct list_head mc_lifespan_local_head; @@ -127,8 +127,8 @@ struct mem_chunk { u64 size; // chunk size - struct mempool *mp; // backpointer to mp - struct mempool_set *mpset; // back pointer to mpset + struct neuron_mempool *mp; // backpointer to mp + struct neuron_mempool_set *mpset; // back pointer to mpset struct gen_pool *gen_pool; // pointer to genpool u32 dram_channel; // DRAM channel @@ -160,14 +160,14 @@ struct mem_chunk { * * Return: 0 if initialization succeeds, a negative error code otherwise. 
*/ -int mpset_constructor(struct mempool_set *mpset, void *pdev, struct neuron_device *nd); +int mpset_constructor(struct neuron_mempool_set *mpset, void *pdev, struct neuron_device *nd); /** * mpset_destructor() - Free all mp in the set. * * @mpset: Pointer to mpset which need to be destroyed. */ -void mpset_destructor(struct mempool_set *mpset); +void mpset_destructor(struct neuron_mempool_set *mpset); /** mpset_search_mc() - Find memory chunk which maps given physical address * @@ -176,7 +176,7 @@ void mpset_destructor(struct mempool_set *mpset); * * Return: mem chunk that has pa on success, NULL on failure */ -struct mem_chunk *mpset_search_mc(struct mempool_set *mp, phys_addr_t pa); +struct mem_chunk *mpset_search_mc(struct neuron_mempool_set *mp, phys_addr_t pa); /** * mc_alloc_align() - Allocate a memory chunk of size from given mpset, with alignment @@ -210,7 +210,7 @@ void mc_free(struct mem_chunk **mcp); * @mpset: Pointer to mpset * @lifespan: Lifespan list to use */ -void mpset_free_expired_mc(struct mempool_set *mpset, enum mc_lifespan lifespan); +void mpset_free_expired_mc(struct neuron_mempool_set *mpset, enum mc_lifespan lifespan); /** * mc_inc_refcount() - Increases reference count of the given mc. diff --git a/neuron_metrics.c b/neuron_metrics.c index b849262..65185fb 100644 --- a/neuron_metrics.c +++ b/neuron_metrics.c @@ -33,6 +33,7 @@ MODULE_PARM_DESC(nmetric_log_posts, "1: send metrics to CW, 2: send metrics to t static int nmetric_counters_buf_size = sizeof(u64) * NMETRIC_COUNTER_COUNT; static int nmetric_versions_buf_size = sizeof(struct nmetric_versions) * NMETRIC_VERSION_COUNT; static int nmetric_constants_buf_size = sizeof(char) * NMETRIC_CONSTANTS_COUNT * (NEURON_METRICS_VERSION_STRING_MAX_LEN + 1); +static int nmetric_ecc_err_buf_size = sizeof(u64) * NMETRIC_ECC_ERR_COUNT; static char nmetric_constant_metrics[NMETRIC_CONSTANTS_COUNT][NEURON_METRICS_VERSION_STRING_MAX_LEN + 1]; static const char nmetric_instance_id_path[] = "/sys/devices/virtual/dmi/id/board_asset_tag"; @@ -67,6 +68,9 @@ enum nmetric_cw_id { // Ultraserver mode configured on device (only for ULTRASERVER/PDS platforms), values defined in neuron_ultraserver_mode enum NMETRIC_CW_ID_ULTRASERVER_MODE = 58, + // Workload ID based off hashed neff id + NMETRIC_CW_ID_AGG_NEFF_ID = 80, + // Platform Utilization Metrics // Percentage of time that the neuron device was executing NEFFs in a given interval, aggregated across NCs // For example, a ND with full utilization of one core with the other idle, will be reported as 50% @@ -202,7 +206,6 @@ static const nmetric_def_t nmetric_defs[] = { NMETRIC_COUNTER_DEF(18, POST_TIME_TICK_0, NMETRIC_CW_ID_NERR_OOB, NDS_NC_COUNTER_OOB), NMETRIC_COUNTER_DEF(19, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_COLLECTIVES, NDS_EXT_NC_COUNTER_HW_ERR_COLLECTIVES), - NMETRIC_COUNTER_DEF(20, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_HBM_UE, NDS_EXT_NC_COUNTER_HW_ERR_HBM_UE), NMETRIC_COUNTER_DEF(21, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_NC_UE, NDS_EXT_NC_COUNTER_HW_ERR_NC_UE), NMETRIC_COUNTER_DEF(22, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_DMA_ABORT, NDS_EXT_NC_COUNTER_HW_ERR_DMA_ABORT), @@ -212,15 +215,19 @@ static const nmetric_def_t nmetric_defs[] = { NMETRIC_COUNTER_DEF(25, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_SW_EVENT_ERROR, NDS_EXT_NC_COUNTER_ERR_SW_EVENT_ERROR), NMETRIC_COUNTER_DEF(26, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_SW_PSUM_COLLISION, NDS_EXT_NC_COUNTER_ERR_SW_PSUM_COLLISION), NMETRIC_COUNTER_DEF(27, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_SW_SEQUENCER_FATAL, 
NDS_EXT_NC_COUNTER_ERR_SW_SEQUENCER_FATAL), - NMETRIC_COUNTER_DEF(28, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_REPAIRABLE_HBM_UE, NDS_EXT_NC_COUNTER_HW_ERR_REPAIRABLE_HBM_UE), NMETRIC_UTILIZATION_DEF(29, POST_TIME_ALWAYS, NMETRIC_CW_ID_NC_UTILIZATION, NDS_NC_COUNTER_TIME_IN_USE), + // ECC Error Count Metrics + NMETRIC_DRIVER_ECC_ERR_DEF(0, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_HBM_UE), + NMETRIC_DRIVER_ECC_ERR_DEF(1, POST_TIME_TICK_1, NMETRIC_CW_ID_NERR_HW_ERR_REPAIRABLE_HBM_UE), + // bitmap metrics NMETRIC_BITMAP_DEF(0, POST_TIME_TICK_1, NMETRIC_CW_ID_FEATURE_BITMAP, NDS_ND_COUNTER_FEATURE_BITMAP), NMETRIC_BITMAP_DEF(0, POST_TIME_TICK_1, NMETRIC_CW_ID_UNUSED, NDS_ND_COUNTER_DYNAMIC_SYSFS_METRIC_BITMAP), // const uint64 metrics NMETRIC_CONSTANT_U64(0, POST_TIME_TICK_1, NMETRIC_CW_ID_DEVICE_CLUSTER_ID, NDS_ND_COUNTER_DEVICE_CLUSTER_ID, NMETRIC_CONST_U64_FLAG_SKIP_ZERO), + NMETRIC_CONSTANT_U64(1, POST_TIME_TICK_1, NMETRIC_CW_ID_AGG_NEFF_ID, NDS_ND_COUNTER_AGG_NEFF_ID, NMETRIC_CONST_U64_FLAG_SKIP_ZERO), // driver metrics. not in datastore NMETRIC_DRIVER_DEF(NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS, POST_TIME_TICK_1, NMETRIC_CW_ID_MAX_DEVICE_RESET_TIME_MS), @@ -649,10 +656,10 @@ static inline int nmetric_post_feature_bitmap(const nmetric_def_t *metric, struc return metric_size; } -static int nmetric_post_u64(const nmetric_def_t *metric, u64 metric_value, struct nmetric_cw_metric *dest, int available_size) +static int nmetric_post_u64_fmt(const nmetric_def_t *metric, const char *format, u64 metric_value, struct nmetric_cw_metric *dest, int available_size) { // check if there is enough space in buffer - int expected_len = snprintf(NULL, 0, "%llu", metric_value); + int expected_len = snprintf(NULL, 0, format, metric_value); int metric_size = sizeof(struct nmetric_cw_metric) + expected_len; if (available_size < metric_size) { return 0; @@ -661,12 +668,12 @@ static int nmetric_post_u64(const nmetric_def_t *metric, u64 metric_value, struc // save metrics to buffer dest->id = metric->cw_id; dest->len = expected_len; - snprintf(dest->data, expected_len + 1, "%llu", metric_value); // post the as decimal not hex, as cw reads it in decimal format + snprintf(dest->data, expected_len + 1, format, metric_value); return metric_size; } -static inline int nmetric_post_constant_u64(const nmetric_def_t *metric, struct nmetric_cw_metric *dest, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, int available_size) +static inline int nmetric_post_constant_u64_fmt(const nmetric_def_t *metric, const char *format, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, struct nmetric_cw_metric *dest, int available_size) { // we have a choice of taking the metric value from previous // NDS or current NDS. 
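The formatting helper above sizes its output with a two-pass snprintf() before writing into the posting buffer. A minimal userspace sketch of that idiom follows; it is illustrative only and not part of the patch, and the value and buffer names are made up:

#include <stdio.h>

int main(void)
{
	unsigned long long value = 0x1234abcdULL;
	char buf[32];
	/* Pass 1: a NULL buffer makes snprintf() return the formatted length without writing anything. */
	int expected_len = snprintf(NULL, 0, "%llx", value);
	/* Pass 2: write the value, reserving one extra byte for the terminating NUL. */
	snprintf(buf, expected_len + 1, "%llx", value);
	printf("len=%d str=%s\n", expected_len, buf); /* prints: len=8 str=1234abcd */
	return 0;
}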
@@ -687,7 +694,17 @@ static inline int nmetric_post_constant_u64(const nmetric_def_t *metric, struct return 0; } - return nmetric_post_u64(metric, metric_value, dest, available_size); + return nmetric_post_u64_fmt(metric, format, metric_value, dest, available_size); +} + +static inline int nmetric_post_decimal_constant_u64(const nmetric_def_t *metric, struct nmetric_cw_metric *dest, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, int available_size) +{ + return nmetric_post_constant_u64_fmt(metric, "%llu", const_u64_metrics, freed_const_u64_metrics, dest, available_size); +} + +static inline int nmetric_post_hex_constant_u64(const nmetric_def_t *metric, struct nmetric_cw_metric *dest, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, int available_size) +{ + return nmetric_post_constant_u64_fmt(metric, "%llx", const_u64_metrics, freed_const_u64_metrics, dest, available_size); } // TODO: This function is a quick workaround to post and reset the driver metrics: @@ -721,7 +738,7 @@ static inline int nmetric_post_and_reset_driver_metrics(const nmetric_def_t *dri metric_value = total_time / total_count; } - return nmetric_post_u64(driver_final_metric, metric_value, dest, available_size); + return nmetric_post_u64_fmt(driver_final_metric, "%llu", metric_value, dest, available_size); } static inline int nmetric_post_driver_userver_metrics(const nmetric_def_t *metric, struct nmetric_cw_metric *dest, int available_size) @@ -754,7 +771,51 @@ static inline int nmetric_post_driver_userver_metrics(const nmetric_def_t *metri metric_value = mode; } - return nmetric_post_u64(metric, metric_value, dest, available_size); + return nmetric_post_u64_fmt(metric, "%llu", metric_value, dest, available_size); +} + +/** + * Function for updating the ECC memory error counts in the driver. Uses the same parsing logic for the ECC miscram registers as the sysfs + * module to ensure data consistency. + * + * @param metric Current metric to be posted + * @param dest The destination buffer to write the TVL metric data into + * @param available_size The remaining size in the dest buffer + * + * @return Size of the metric posting when appended to the buffer + */ +static inline int nmetric_post_driver_ecc_metrics(struct neuron_device *nd, const nmetric_def_t *metric, + struct nmetric_cw_metric *dest, int available_size) +{ + uint32_t metric_value = 0; + + // Read the current value of the hbm_err_count registers in miscram using the same function as sysfs for consistency + switch (metric->cw_id) { + case NMETRIC_CW_ID_NERR_HW_ERR_HBM_UE: + ndhal->ndhal_sysfs_metrics.nsysfsmetric_get_hbm_error_count(nd, false, &metric_value); + break; + case NMETRIC_CW_ID_NERR_HW_ERR_REPAIRABLE_HBM_UE: + ndhal->ndhal_sysfs_metrics.nsysfsmetric_get_hbm_error_count(nd, true, &metric_value); + break; + default: + pr_err_once("Unrecognized ECC Metric ID %d. Skipping parsing metric", metric->cw_id); + return 0; + break; + } + + // Subtract out previous errors during this session e.g. we get HBM UEs but do not degrade the node. Prevents double counting errors. + // In the case we detect an underflow, record the metric as 0 and set ecc_prev to the current register value. This is mostly to combat + // the case where Pacific has a bug in register writing, or resets the chip underneath us. 
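+	// For example, if ecc_prev is 3 and the register now reads 5, a delta of 2 is posted and ecc_prev advances to 5; if the register later reads back 1 (underflow), 0 is posted and ecc_prev is reset to 1.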
+ if (nd->metrics.neuron_aggregation.ecc_prev[metric->index] <= metric_value) { + metric_value -= nd->metrics.neuron_aggregation.ecc_prev[metric->index]; + nd->metrics.neuron_aggregation.ecc_prev[metric->index] += metric_value; + } else { + pr_warn_once("Integer underflow detected when parsing HBM UE metrics. Adjusting stats to avoid an overcount."); + nd->metrics.neuron_aggregation.ecc_prev[metric->index] = metric_value; + metric_value = 0; + } + + return nmetric_post_u64_fmt(metric, "%llu", metric_value, dest, available_size); } /** @@ -793,33 +854,40 @@ static void nmetric_post_metrics(struct neuron_device *nd, u64 *curr_metrics, u6 } dest = (struct nmetric_cw_metric *)&nd->metrics.posting_buffer[data_size]; switch(curr_metric->type) { - case NMETRIC_TYPE_CONSTANT: - data_size += nmetric_post_constant(curr_metric, dest, available_size); - break; - case NMETRIC_TYPE_VERSION: - data_size += nmetric_post_version(versions, curr_metric, dest, available_size); - break; - case NMETRIC_TYPE_UTILIZATION: - data_size += nmetric_post_utilization(nd, curr_metrics, prev_metrics, freed_metrics, - curr_metric, dest, available_size); - break; - case NMETRIC_TYPE_COUNTER: - case NMETRIC_TYPE_FW_IO_ERR: - data_size += nmetric_post_counter(curr_metrics, prev_metrics, freed_metrics, - curr_metric, dest, available_size); - break; - case NMETRIC_TYPE_BITMAP: - data_size += nmetric_post_feature_bitmap(curr_metric, dest, curr_feature_bitmap, freed_feature_bitmap, available_size); - break; - case NMETRIC_TYPE_CONSTANT_U64: - data_size += nmetric_post_constant_u64(curr_metric, dest, const_u64_metrics, freed_const_u64_metrics, available_size); - break; - case NMETRIC_TYPE_DRIVER_RESET: - data_size += nmetric_post_and_reset_driver_metrics(curr_metric, dest, &nd->metrics.driver_metrics, available_size); - break; - case NMETRIC_TYPE_DRIVER_USERVER: - data_size += nmetric_post_driver_userver_metrics(curr_metric, dest, available_size); - break; + case NMETRIC_TYPE_CONSTANT: + data_size += nmetric_post_constant(curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_VERSION: + data_size += nmetric_post_version(versions, curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_UTILIZATION: + data_size += nmetric_post_utilization(nd, curr_metrics, prev_metrics, freed_metrics, + curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_COUNTER: + case NMETRIC_TYPE_FW_IO_ERR: + data_size += nmetric_post_counter(curr_metrics, prev_metrics, freed_metrics, + curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_BITMAP: + data_size += nmetric_post_feature_bitmap(curr_metric, dest, curr_feature_bitmap, freed_feature_bitmap, available_size); + break; + case NMETRIC_TYPE_CONSTANT_U64: + if (curr_metric->cw_id == NMETRIC_CW_ID_AGG_NEFF_ID) { + data_size += nmetric_post_hex_constant_u64(curr_metric, dest, const_u64_metrics, freed_const_u64_metrics, available_size); + } else { + data_size += nmetric_post_decimal_constant_u64(curr_metric, dest, const_u64_metrics, freed_const_u64_metrics, available_size); + } + break; + case NMETRIC_TYPE_DRIVER_RESET: + data_size += nmetric_post_and_reset_driver_metrics(curr_metric, dest, &nd->metrics.driver_metrics, available_size); + break; + case NMETRIC_TYPE_DRIVER_USERVER: + data_size += nmetric_post_driver_userver_metrics(curr_metric, dest, available_size); + break; + case NMETRIC_TYPE_ECC_ERR_COUNTER: + data_size += nmetric_post_driver_ecc_metrics(nd, curr_metric, dest, available_size); + break; } } @@ -996,6 +1064,7 @@ static int nmetric_thread_fn(void *arg) 
memset(nd->metrics.neuron_aggregation.prev, 0, nmetric_counters_buf_size); memset(nd->metrics.neuron_aggregation.curr, 0, nmetric_counters_buf_size); memset(nd->metrics.neuron_aggregation.freed, 0, nmetric_counters_buf_size); + memset(nd->metrics.neuron_aggregation.ecc_prev, 0, nmetric_ecc_err_buf_size); memset(component_versions, 0, nmetric_versions_buf_size); curr_feature_bitmap = 0; freed_feature_bitmap = 0; diff --git a/neuron_metrics.h b/neuron_metrics.h index 60fbb61..59585ad 100644 --- a/neuron_metrics.h +++ b/neuron_metrics.h @@ -29,6 +29,7 @@ #define NMETRIC_TYPE_DRIVER_RESET 0x6 #define NMETRIC_TYPE_DRIVER_USERVER 0x7 #define NMETRIC_TYPE_UTILIZATION 0x8 +#define NMETRIC_TYPE_ECC_ERR_COUNTER 0x9 #define NMETRIC_FLAG_VERS_ALLOW_TYPE (1) @@ -81,7 +82,10 @@ struct nmetric_driver_metrics { #define NMETRIC_BITMAP_COUNT 1 // Number of metrics of type NMETRIC_CONSTANT_U64 -#define NMETRIC_CONSTANT_U64_COUNT 1 +#define NMETRIC_CONSTANT_U64_COUNT 2 + +// Number of metrics of type NMETRIC_TYPE_ECC_ERR_COUNTER +#define NMETRIC_ECC_ERR_COUNT 3 typedef struct { u8 index; // metric specific index @@ -104,6 +108,7 @@ typedef struct { #define NMETRIC_CONSTANT_U64(idx, tick, cw_id, ds_id, flags) NMETRIC_DEF(idx, NMETRIC_TYPE_CONSTANT_U64, 1, tick, cw_id, ds_id, flags) #define NMETRIC_DRIVER_DEF(idx, tick, cw_id) NMETRIC_DEF(idx, NMETRIC_TYPE_DRIVER_RESET, 1, tick, cw_id, 0xFF, 0) #define NMETRIC_DRIVER_USERVER_DEF(idx, tick, cw_id) NMETRIC_DEF(idx, NMETRIC_TYPE_DRIVER_USERVER, 1, tick, cw_id, 0xFF, 0) +#define NMETRIC_DRIVER_ECC_ERR_DEF(idx, tick, cw_id) NMETRIC_DEF(idx, NMETRIC_TYPE_ECC_ERR_COUNTER, 1, tick, cw_id, 0xFF, 0) struct nmetric_versions { u32 version_usage_count[NEURON_METRICS_VERSION_MAX_CAPACITY]; @@ -126,6 +131,7 @@ struct nmetric_aggregation_thread { u64 curr[NMETRIC_COUNTER_COUNT]; // metrics for the current session so far u64 prev[NMETRIC_COUNTER_COUNT]; // recorded metrics from the last post u64 freed[NMETRIC_COUNTER_COUNT]; // cache holding metrics that were freed before the posting period was reached + u64 ecc_prev[NMETRIC_ECC_ERR_COUNT]; // ECC error counts up to the current post }; struct neuron_metrics { diff --git a/neuron_module.c b/neuron_module.c index 0b257b3..56713ba 100644 --- a/neuron_module.c +++ b/neuron_module.c @@ -18,13 +18,13 @@ #include "neuron_cdev.h" #include "neuron_pci.h" -MODULE_DESCRIPTION("Neuron Driver, built from SHA: 6670442319042643165ab7986e5184496ea4407c"); +MODULE_DESCRIPTION("Neuron Driver, built from SHA: 1c7ed9bd14936635773b5a01777882804ee8ea6e"); MODULE_LICENSE("GPL"); -MODULE_VERSION("2.26.5.0"); +MODULE_VERSION("2.27.4.0"); MODULE_ALIAS("pci:v00001d0fd00007064sv*sd*bc*sc*i*"); -const char driver_version[] = "2.26.5.0"; -const char driver_revision[] = "6670442319042643165ab7986e5184496ea4407c"; +const char driver_version[] = "2.27.4.0"; +const char driver_revision[] = "1c7ed9bd14936635773b5a01777882804ee8ea6e"; #ifdef CONFIG_FAULT_INJECTION diff --git a/neuron_nq.h b/neuron_nq.h index 6bfd7e3..e956a1a 100644 --- a/neuron_nq.h +++ b/neuron_nq.h @@ -10,6 +10,48 @@ #include #include "neuron_device.h" +#define NOTIFIC_NQ_SIZE 0x28 // total size of the NQ register space +#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START 0x100 +#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) + +#define NOTIFIC_NQ_BASE_ADDR_LO_RESET_VALUE 0x00000000 + +static inline void notific_write_nq_base_addr_lo(void __iomem *base, size_t index, + uint32_t value) +{ + const size_t offset = 
NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index); + + reg_write32(base + offset, value); +} + +#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START 0x104 +#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) + +#define NOTIFIC_NQ_BASE_ADDR_HI_RESET_VALUE 0x00000000 + +static inline void notific_write_nq_base_addr_hi(void __iomem *base, size_t index, + uint32_t value) +{ + const size_t offset = NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index); + + reg_write32(base + offset, value); +} + +#define NOTIFIC_NQ_F_SIZE_OFFSET_START 0x108 +#define NOTIFIC_NQ_F_SIZE_OFFSET(index) (NOTIFIC_NQ_F_SIZE_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) + +#define NOTIFIC_F_SIZE_RESET_VALUE 0x00000000 + +static inline void notific_write_nq_f_size(void __iomem *base, size_t index, + uint32_t value) +{ + const size_t offset = NOTIFIC_NQ_F_SIZE_OFFSET(index); + + reg_write32(base + offset, value); +} + +#define NOTIFIC_NQ_HEAD_OFFSET 0x10c + /** * nnq_init() - Initialize notification queue for NeuronCore * diff --git a/neuron_pci.c b/neuron_pci.c index dbb1b14..f385b3d 100644 --- a/neuron_pci.c +++ b/neuron_pci.c @@ -57,6 +57,8 @@ static atomic_t device_count = ATOMIC_INIT(0); struct neuron_device *neuron_devices[MAX_NEURON_DEVICE_COUNT] = { 0 }; int total_neuron_devices = 0; +extern unsigned int nmetric_log_posts; + extern void ndmar_preinit(struct neuron_device *nd); struct neuron_device *neuron_pci_get_device(u8 device_index) @@ -65,6 +67,45 @@ struct neuron_device *neuron_pci_get_device(u8 device_index) return neuron_devices[device_index]; } +static atomic_t dup_rid_cnt = ATOMIC_INIT(0); // count of duplicate routing IDs encountered +int neuron_pci_handle_dup_routing_id(void) { + int ret = -ENODEV; + int dup_cnt; + char cmd[256]; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); +#else + dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; +#endif + + // If this is the first dup encountered, unload the driver + if ((dup_cnt == 0) && dup_helper_enable) { + pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); + + int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); + if (n >= sizeof(cmd)) { + pr_err("unable to schedule driver unload: cmd buffer len exceeded\n"); + return -EINVAL; + } + char *argv[] = { "/bin/sh", + "-c", + cmd, + NULL}; + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL}; + + ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); + if (ret) { + pr_err("unable to schedule driver unload. 
Error: %d\n", ret); + } + } + + return ret; +} + static int neuron_pci_device_init(struct neuron_device *nd) { int i, ret; @@ -102,7 +143,7 @@ static int neuron_pci_device_init(struct neuron_device *nd) goto fail_mch; // Initialize the device mpset - memset(&nd->mpset, 0, sizeof(struct mempool_set)); + memset(&nd->mpset, 0, sizeof(struct neuron_mempool_set)); // Initialize the host portion in mpset ret = mpset_constructor(&nd->mpset, &(nd->pdev->dev), nd); @@ -113,6 +154,8 @@ static int neuron_pci_device_init(struct neuron_device *nd) for (i = 0; i < MAX_NC_PER_DEVICE; i++) mutex_init(&nd->crwl[i].lock); + nd->supports_hbm_7200 = -1; + ret = ncdev_create_device_node(nd); if (ret) { pci_info(nd->pdev, "create device node failed\n"); @@ -132,7 +175,6 @@ static int neuron_pci_device_init(struct neuron_device *nd) fail_mch: if (nd->fw_io_ctx) fw_io_destroy((struct fw_io_ctx *)nd->fw_io_ctx); - nd->fw_io_ctx = NULL; return ret; } @@ -185,12 +227,134 @@ static void neuron_pci_set_device_architecture(struct neuron_device *nd) narch_init(arch, revision); } +static bool is_valid_bar(int bar) { + return ((bar == ndhal->ndhal_pci.apb_bar) || (bar == ndhal->ndhal_pci.axi_bar) || (bar == ndhal->ndhal_pci.dram_bar)); +} + +/** + * neuron_pci_reserve_bar() - Mark the PCI region associated with PCI BAR as being reserved + * + * @param dev: PCI device whose resources are to be reserved + * @param bar: BAR to be reserved + * @param res_name: Name to be associated with resource. + * @return int: Returns 0 on success, otherwise failure + */ +static int neuron_pci_reserve_bar(struct pci_dev *dev, int bar, const char *res_name) { + int ret; + + if (!is_valid_bar(bar)) { + pci_info(dev, "invalid BAR%d\n", bar); + goto err; + } + if (bar == BAR_UNUSED) { + return 0; + } + + ret = pci_request_region(dev, bar, res_name); + if (ret) { + pci_info(dev, "BAR %d: can't reserve %s\n", bar, res_name); + goto err; + } + + return 0; + +err: + // allow failure to map on dram bar, as some setups may not support it + if (bar == ndhal->ndhal_pci.dram_bar) { + return 0; + } else { + return -ENODEV; + } +} + + /** + * neuron_pci_set_npdev() - set BAR's physical addr, io addr, and size of neuron_pci_device + * + * @param dev: PCI device that owns the BAR + * @param bar: BAR number + * @param res_name: Name associated with resource + * @param bar_pa: start physical address of BAR + * @param bar_ioaddr: __iomem address to device BAR + * @param bar_size: size of BAR + * @return int: Returns 0 on success, otherwise failure + */ +static int neuron_pci_set_npdev(struct pci_dev *dev, + int bar, + const char *res_name, + phys_addr_t *bar_pa, + void __iomem **bar_ioaddr, + u64 *bar_size) { + if (!is_valid_bar(bar)) { + pci_info(dev, "invalid BAR%d\n", bar); + return -ENODEV; + } + if (bar == BAR_UNUSED) { + return 0; + } + + if (pci_resource_len(dev, bar) == 0) { + pci_info(dev, "BAR%d len is 0\n", bar); + goto err; + } + + *bar_pa = pci_resource_start(dev, bar); + if (!(*bar_pa)) { + pci_info(dev, "Can't get start address of BAR%d %s\n", bar, res_name); + goto err; + } + *bar_size = pci_resource_len(dev, bar); + + if (bar == ndhal->ndhal_pci.dram_bar) { + ndhal->ndhal_pci.dram_bar_size = *bar_size; + } + + if (bar == ndhal->ndhal_pci.dram_bar && wc_enable) { + *bar_ioaddr = pci_iomap_wc(dev, bar, pci_resource_len(dev, bar)); + } + else { + *bar_ioaddr = pci_iomap(dev, bar, pci_resource_len(dev, bar)); + } + + return 0; + +err: + // allow failure to map on dram bar, as some setups may not support it + if (bar == ndhal->ndhal_pci.dram_bar) 
{ + *bar_pa = 0; + *bar_size = 0; + *bar_ioaddr = NULL; + return 0; + } else { + return -ENODEV; + } +} + +/** + * neuron_pci_release_bar() - Release a PCI BAR + * + * @param dev: PCI device whose resources were previously reserved by pci_request_region() + * @param bar: BAR to be released + * + */ +static int neuron_pci_release_bar(struct pci_dev *dev, int bar) { + if (!is_valid_bar(bar)) { + pci_info(dev, "invalid BAR%d\n", bar); + return -ENODEV; + } + if (bar == BAR_UNUSED) { + return 0; + } + + pci_release_region(dev, bar); + return 0; +} + static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) { int ret = 0; struct neuron_device *nd; - nd = kzalloc(sizeof(struct neuron_device), GFP_KERNEL); + nd = kvzalloc(sizeof(struct neuron_device), GFP_KERNEL); if (nd == NULL) { pci_info(dev, "Can't allocate memory for neuron_device\n"); goto fail_alloc_nd_mem; @@ -223,31 +387,31 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) } // map apb bar - ret = ndhal->ndhal_pci.neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.apb_bar, "APB"); + ret = neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.apb_bar, "APB"); if (ret) { goto fail_bar0_map; } - ret = ndhal->ndhal_pci.neuron_pci_set_npdev(dev, ndhal->ndhal_pci.apb_bar, "APB", &nd->npdev.bar0_pa, &nd->npdev.bar0, &nd->npdev.bar0_size); + ret = neuron_pci_set_npdev(dev, ndhal->ndhal_pci.apb_bar, "APB", &nd->npdev.bar0_pa, &nd->npdev.bar0, &nd->npdev.bar0_size); if (ret) { goto fail_bar0_resource; } // map bar2 - ret = ndhal->ndhal_pci.neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.axi_bar, "AXI"); + ret = neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.axi_bar, "AXI"); if (ret) { goto fail_bar2_map; } - ret = ndhal->ndhal_pci.neuron_pci_set_npdev(dev, ndhal->ndhal_pci.axi_bar, "AXI", &nd->npdev.bar2_pa, &nd->npdev.bar2, &nd->npdev.bar2_size); + ret = neuron_pci_set_npdev(dev, ndhal->ndhal_pci.axi_bar, "AXI", &nd->npdev.bar2_pa, &nd->npdev.bar2, &nd->npdev.bar2_size); if (ret) { goto fail_bar2_resource; } // map bar4 - ret = ndhal->ndhal_pci.neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.dram_bar, "BAR4"); + ret = neuron_pci_reserve_bar(dev, ndhal->ndhal_pci.dram_bar, "BAR4"); if (ret) { goto fail_bar4_map; } - ret = ndhal->ndhal_pci.neuron_pci_set_npdev(dev, ndhal->ndhal_pci.dram_bar, "BAR4", &nd->npdev.bar4_pa, &nd->npdev.bar4, &nd->npdev.bar4_size); + ret = neuron_pci_set_npdev(dev, ndhal->ndhal_pci.dram_bar, "BAR4", &nd->npdev.bar4_pa, &nd->npdev.bar4, &nd->npdev.bar4_size); if (ret) { goto fail_bar4_resource; } @@ -290,10 +454,12 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) if (ret) goto fail_memset_mc; - // initialize metric aggregation and posting - ret = nmetric_init(nd); - if (ret) - goto fail_nmetric_resource; + if (nmetric_log_posts != 0) { + // initialize metric aggregation and posting + ret = nmetric_init(nd); + if (ret) + goto fail_nmetric_resource; + } mutex_init(&nd->memset_lock); @@ -310,19 +476,19 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) fail_nds_resource: neuron_ds_destroy(&nd->datastore); fail_bar4_resource: - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.dram_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.dram_bar); fail_bar4_map: fail_bar2_resource: - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.axi_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.axi_bar); fail_bar2_map: fail_bar0_resource: - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.apb_bar); 
+ neuron_pci_release_bar(dev, ndhal->ndhal_pci.apb_bar); fail_bar0_map: pci_disable_device(dev); fail_dhal_init: fail_enable: neuron_log_destroy( nd); - kfree(nd); + kvfree(nd); fail_alloc_nd_mem: pci_set_drvdata(dev, NULL); return ret; @@ -342,11 +508,11 @@ static void neuron_pci_remove(struct pci_dev *dev) ndhal->ndhal_ext_cleanup(); - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.apb_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.apb_bar); - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.axi_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.axi_bar); - ndhal->ndhal_pci.neuron_pci_release_bar(dev, ndhal->ndhal_pci.dram_bar); + neuron_pci_release_bar(dev, ndhal->ndhal_pci.dram_bar); pci_disable_device(dev); @@ -364,7 +530,7 @@ static void neuron_pci_remove(struct pci_dev *dev) neuron_log_destroy(nd); - kfree(nd); + kvfree(nd); } static struct pci_driver neuron_pci_driver = { diff --git a/neuron_pci.h b/neuron_pci.h index de95f48..afe7afe 100644 --- a/neuron_pci.h +++ b/neuron_pci.h @@ -25,6 +25,13 @@ extern int wc_enable; */ struct neuron_device *neuron_pci_get_device(u8 device_index); +/** + * neuron_pci_handle_dup_routing_id() - Handle the case where multiple devices share the same routing id + * + * Return: 0 if successful, a negative error code otherwise. + */ +int neuron_pci_handle_dup_routing_id(void); + /** * neuron_pci_module_init() - Initialize Neuron PCI driver. * diff --git a/neuron_power.c b/neuron_power.c index 4fc0fce..58887d5 100644 --- a/neuron_power.c +++ b/neuron_power.c @@ -203,7 +203,7 @@ static void npower_calculate_stats(struct neuron_power_samples *current_samples, min_power_to_log = current_samples->max_power_bips; } if (power_enabled_in_fw) { - pr_info("Not enough data to aggregate stats. Have %u data points, min of %u max of %u, total of %llu.", + pr_debug("Not enough data to aggregate stats. 
Have %u data points, min of %u max of %u, total of %llu.", current_samples->num_data_points, min_power_to_log, current_samples->max_power_bips, current_samples->total_power_util_bips); diff --git a/neuron_reg_access.c b/neuron_reg_access.c index 7f6c98b..eceac3e 100644 --- a/neuron_reg_access.c +++ b/neuron_reg_access.c @@ -7,7 +7,7 @@ inline int reg_read32(const u32 __iomem *addr, u32 *value) { - return ndhal->ndhal_reg_access.reg_read32_array((void **)&addr, value, 1); + return ndhal->ndhal_fw_io.fw_io_read_csr_array((void **)&addr, value, 1, true); } inline void reg_write32(u32 __iomem *addr, u32 value) diff --git a/neuron_reset.c b/neuron_reset.c index 1794ba4..ff7b3a6 100644 --- a/neuron_reset.c +++ b/neuron_reset.c @@ -25,9 +25,13 @@ int no_reset = 0; module_param(no_reset, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); MODULE_PARM_DESC(no_reset, "Dont reset device"); +int reset_top_dma = 0; +module_param(reset_top_dma, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(reset_top_dma, "Reset top-level DMAs during TPB reset"); + #define NR_DEVICE_RESET_RETRY_INTERVAL 30000 // millisecond #define NR_TPB_RESET_RETRY_INTERVAL 10000 // millisecond - +#define NR_RESET_POLL_INTERVAL 100 // millisecond /** * ITER_COAL_REQS - iterate over coalesced reset requests @@ -61,7 +65,7 @@ int nr_msleep_stoppable(struct neuron_device *nd, uint32_t msec) static int nr_call_post_reset_config(struct neuron_device *nd, uint32_t nc_map, bool reset_succeeded) { if (nc_map == NEURON_NC_MAP_DEVICE) { - return ndhal->ndhal_reset.nr_post_reset_config(nd, reset_succeeded); + return ndhal->ndhal_reset.nr_post_reset_config(nd, reset_succeeded, no_reset); } return 0; } @@ -94,7 +98,7 @@ static int nr_reset_thread_fn(void *arg) if (first_request->request_id != NEURON_RESET_REQUEST_ALL && first_request->next != NULL) { ITER_COAL_REQS(request_iter, first_request, last_request, { coal_cnt++; - nc_map |= request_iter->nc_map; + nc_map |= request_iter->nc_map; }) } else { last_request = first_request; @@ -374,7 +378,7 @@ bool nr_op_in_reset_wnd(uint64_t op_start_time, struct neuron_device *nd) return false; } -int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t tpb_reset_map) +int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t tpb_reset_map_lo, uint32_t tpb_reset_map_hi) { bool is_device_reset; uint32_t reset_retry_interval; @@ -391,7 +395,7 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t start_time = ktime_get(); /* Send reset request to firmware */ - fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map); + fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map_lo, tpb_reset_map_hi); next_reset_retry_time = ktime_add_ms(start_time, reset_retry_interval); do { @@ -399,7 +403,7 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t * After reset initiation, firmware becomes unresponsive until * the device completes the reset. Wait before next polling cycle. */ - if (nr_msleep_stoppable(nd, ndhal->ndhal_reset.reset_poll_interval)) { + if (nr_msleep_stoppable(nd, NR_RESET_POLL_INTERVAL)) { return -EINTR; } @@ -421,7 +425,7 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t * If timed out, retry the reset. * This handles cases where the initial/previous reset was missed. 
*/ - fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map); + fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map_lo, tpb_reset_map_hi); next_reset_retry_time = ktime_add_ms(cur_time, reset_retry_interval); } diff --git a/neuron_reset.h b/neuron_reset.h index 12a6ebc..afac5a5 100644 --- a/neuron_reset.h +++ b/neuron_reset.h @@ -113,12 +113,13 @@ bool nr_op_in_reset_wnd(uint64_t op_start_time, struct neuron_device *nd); * * @nd: Neuron device structure * @nc_map: Neural Core map that specifies reset scope (device vs TPB level) - * @tpb_reset_map: Bitmap of TPBs to reset + * @tpb_reset_map_lo: Bitmap of TPBs/SDMA/TopSp/CC_TOP to reset (bits 0-31) + * @tpb_reset_map_hi: Bitmap of top-level H2D DMAs to reset (bits 0-3) * * @return: 0 on success, -1 on failure or interruption * */ -int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t tpb_reset_map); +int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t tpb_reset_map_lo, uint32_t tpb_reset_map_hi); /** * nr_msleep_stoppable() - Sleep until msec or reset thread is stopped diff --git a/neuron_ring.c b/neuron_ring.c index 0c8420c..280e961 100644 --- a/neuron_ring.c +++ b/neuron_ring.c @@ -31,7 +31,7 @@ module_param(dma_teardown_on_exit, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); MODULE_PARM_DESC(dma_teardown_on_exit, "Reset the DMA state on user process exit"); // forward -static void ndmar_h2t_ring_free(struct ndma_ring *ring); +static void ndmar_h2t_ring_free(struct ndma_eng *eng, struct ndma_ring *ring); struct ndma_eng *ndmar_acquire_engine(struct neuron_device *nd, u32 eng_id) { @@ -150,7 +150,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_ if (eng == NULL) return -EINVAL; - if (qid >= DMA_MAX_Q_V4) { + if (qid >= ndhal->ndhal_udma.num_queues) { ret = -EINVAL; goto done; } @@ -220,7 +220,7 @@ void ndmar_handle_process_exit(struct neuron_device *nd, pid_t pid) struct mem_chunk *mc = nd->ndma_q_dummy_mc; const int desc_count = NDMA_QUEUE_DUMMY_RING_DESC_COUNT; for (eng_id = 0; eng_id < ndhal->ndhal_address_map.seng_dma_eng_per_nd; eng_id++) { - for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + for (qid = 0; qid < ndhal->ndhal_udma.num_queues; qid++) { struct ndma_eng *eng = ndmar_acquire_engine_nl(nd, eng_id); struct ndma_queue *queue; struct ndma_ring *ring; @@ -393,12 +393,26 @@ static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id, int qid) ring->h2t_completion.ptr = h2t_completion_mc->va; ring->h2t_completion.addr = virt_to_phys(ring->h2t_completion.ptr) | ndhal->ndhal_address_map.pci_host_base; + ret = ndma_h2d_compl_queue_init(nd, &ring->dma_compl_queue); + if (ret) { + pr_err("can't initialize h2d dma completion queue\n"); + goto error; + } + + ret = ndma_ctx_queue_init(&ring->dma_ctx_queue); + if (ret) { + pr_err("can't initialize dma context queue\n"); + goto error_ctx_queue; + } + mutex_init(&ring->h2t_ring_lock); ndmar_release_engine(eng); return 0; +error_ctx_queue: + ndma_h2d_compl_queue_destroy(&ring->dma_compl_queue); error: ring->h2t_nc_id = -1; ring->tx_mc = NULL; @@ -469,7 +483,7 @@ int ndmar_h2t_ring_request(struct neuron_device *nd, int nc_id, bool h2t, int *r if (eng == NULL) return -EINVAL; - for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + for (qid = 0; qid < ndhal->ndhal_udma.num_queues; qid++) { if (ndhal->ndhal_ndmar.ndmar_is_h2t_def_q(nd, eng_id, qid)) continue; queue = ndmar_get_queue(eng, qid); @@ -491,7 +505,7 @@ int ndmar_h2t_ring_request(struct neuron_device *nd, int nc_id, bool 
h2t, int *r } ret = ndmar_h2t_ring_init(eng, qid); if (ret) { - ndmar_h2t_ring_free(ring); + ndmar_h2t_ring_free(eng, ring); pr_err("nd%d:nc%d H2T ring init for qid:%d failed - %d\n", nd->device_index, nc_id, qid, ret); ring->h2t_allocated = false; goto done; @@ -519,7 +533,7 @@ int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid) struct ndma_queue *queue; struct ndma_ring *ring; - if (qid >= DMA_MAX_Q_MAX) { + if (qid >= ndhal->ndhal_udma.num_queues) { return -EINVAL; } @@ -542,7 +556,7 @@ int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid) } if (ndmar_h2t_ring_is_h2t(ring)) { - ndmar_h2t_ring_free(ring); + ndmar_h2t_ring_free(eng, ring); } else { ndmar_h2t_ring_state_clr(ring); queue->owner = 0; @@ -768,7 +782,7 @@ int ndmar_init(struct neuron_device *nd) return ndmar_init_ncs(nd, -1); } -static void ndmar_h2t_ring_free(struct ndma_ring *ring) +static void ndmar_h2t_ring_free(struct ndma_eng *eng, struct ndma_ring *ring) { if (ring->tx_mc) { mc_free(&ring->tx_mc); @@ -790,6 +804,9 @@ static void ndmar_h2t_ring_free(struct ndma_ring *ring) ring->h2t_completion_mc = NULL; } + ndma_ctx_queue_free(eng, ring, &ring->dma_ctx_queue); + ndma_h2d_compl_queue_destroy(&ring->dma_compl_queue); + ndmar_h2t_ring_state_clr(ring); } @@ -810,13 +827,13 @@ static void ndmar_h2t_ring_free_all(struct neuron_device *nd, int nc_idx) return; } - for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + for (qid = 0; qid < ndhal->ndhal_udma.num_queues; qid++) { queue = ndmar_get_queue(eng, qid); ring = ndmar_get_ring(queue); if (ndmar_h2t_ring_is_allocated(ring) && ring->h2t_nc_id == nc_idx) { if (ndmar_h2t_ring_is_h2t(ring)) { // h2t queue free all resources - ndmar_h2t_ring_free(ring); + ndmar_h2t_ring_free(eng, ring); } else { // service queue only clear state ndmar_h2t_ring_state_clr(ring); diff --git a/neuron_ring.h b/neuron_ring.h index f031be7..c9d3462 100644 --- a/neuron_ring.h +++ b/neuron_ring.h @@ -7,6 +7,7 @@ #define NEURON_RING_H #include "udma/udma.h" +#include "share/neuron_driver_shared.h" #define DMA_H2T_DESC_COUNT 4096 #define NUM_DMA_ENG_PER_DEVICE 132 // for v2 2 nc with each 16, @@ -19,6 +20,115 @@ extern int nc_per_dev_param; struct neuron_device; struct neuron_dma_eng_state; struct neuron_dma_queue_state; +struct ndma_eng; +struct ndma_ring; + +/* + * H2D DMA Completion Queue (CQ) + * ----------------------------- + * A fixed-size circular buffer shared between driver and runtime, consisting of + * NDMA_H2D_COMPL_QUEUE_CAPACITY Completion Queue Entries (CQEs). + * + * Driver Lib mmaps both the CQ and its metadata (head/tail and capacity) to + * runtime. The driver writes a completion result at tail CQE, while the runtime + * consumes from head CQE and clears it. Each CQE contains a sequence number, + * a completion result, and an opaque context pointer. + * + * Correctness: CQ is SPMC (one driver kthread writes per ND; multiple runtime + * threads read). Runtime must lock; driver is lock-free. Both sides require + * smp_wmb()/smp_rmb() barriers. + * + * Async IO only. + * + * Also see neuron_h2d_dma_compl_queue_t and + * neuron_h2d_dma_compl_queue_entry_t in neuron_driver_shared.h. + * + */ +struct ndma_h2d_compl_queue { + uint32_t capacity_mask; // Capacity mask of the CQ (capacity - 1). + uint32_t tail; // Free-running index of the next free CQE to be written by driver. + // Internally maintained by driver. + struct mem_chunk *mc; // Memchunk for the CQ. 
+ neuron_h2d_dma_compl_queue_t *compl_queue_shared; // the CQ structure mmapped to and shared with user space. +}; + +int ndma_h2d_compl_queue_init(struct neuron_device *nd, struct ndma_h2d_compl_queue *compl_queue); +void ndma_h2d_compl_queue_destroy(struct ndma_h2d_compl_queue *compl_queue); + +/* + * H2D DMA Context Queue + * --------------------- + * A fixed-size circular buffer storing pointers to `ndma_h2t_zcdma_context`. + * The queue tracks DMA context lifecycles (submitted, pinned, or unpinned) + * and maintains four logical indices: + * + * 1. head + * - Index of the first *valid* (non-empty) entry in the queue. + * - The context at `head` may be in any ndma_zcdma_state except + * NDMA_COMPLETED. + * + * 2. tail + * - Index of the next free slot where a new context pointer will be + * inserted. + * + * 3. first_pinned_unsubmitted + * - Index of the earliest context that is pinned but not yet submitted. + * - If no such context exists, this is set to first_unpinned. + * + * 4. first_unpinned + * - Index of the earliest context that is unpinned. + * - If no such context exists, this is set to tail. + * + * Queue state conditions: + * - Empty: head == tail + * - Full: head == ((tail + 1) & (capacity - 1)) + * - Current size: (tail - head + capacity) & (capacity - 1) + * + * Example (capacity = 10): + * + * index: 0 1 2 3 4 5 6 7 8 9 + * entry: [ ] [S] [S] [S] [P] [P] [U] [U] [ ] [ ] + * ^ ^ ^ ^ + * | | | | + * H FPU FU T + * + * Legend: + * H = head + * T = tail + * FPU = first_pinned_unsubmitted + * FU = first_unpinned + * + * [C] = completed + * [S] = submitted + * [P] = pinned but unsubmitted + * [U] = unpinned + * [ ] = empty slot + * + * @entries: Array of DMA context entries in the queue + * @completion_pool: Pre-allocated pool of completion_ptr buffers + * @page_list_pool: Pre-allocated pool of page_list arrays + * @capacity_mask: Maximum number of entries the queue can hold, minus one + * (capacity = capacity_mask + 1) + * @head: Index pointing to the head of the queue + * @tail: Index pointing to the tail of the queue + * @first_pinned_unsubmitted: Index of the first pinned but not yet submitted entry + * @first_unpinned: Index of the first unpinned entry in the queue + * @nr_pinned_pages: Total count of pinned memory pages belonging to the queue + */ +struct ndma_ctx_queue { + struct ndma_h2t_zcdma_context *entries; + void *completion_pool; + void *page_list_pool; + u32 capacity_mask; + u32 head; + u32 tail; + u32 first_pinned_unsubmitted; + u32 first_unpinned; + u64 nr_pinned_pages; +}; + +int ndma_ctx_queue_init(struct ndma_ctx_queue *queue); +void ndma_ctx_queue_free(struct ndma_eng *eng, struct ndma_ring *ring, struct ndma_ctx_queue *queue); /* * dma context for both sync and async DMA operations @@ -49,6 +159,8 @@ struct ndma_ring { struct udma_ring_ptr h2t_completion; // TODO why are we using udma_ring_ptr... 
struct mem_chunk *h2t_completion_mc; struct ndma_h2t_dma_context h2t_dma_ctx[NEURON_DMA_H2T_CTX_HANDLE_CNT]; + struct ndma_ctx_queue dma_ctx_queue; + struct ndma_h2d_compl_queue dma_compl_queue; u32 h2t_nc_id; bool h2t_allocated; // ring can be allocated for standard use or h2t u32 qid; diff --git a/neuron_sysfs_metrics.c b/neuron_sysfs_metrics.c index 2fd72b3..fd71ae0 100644 --- a/neuron_sysfs_metrics.c +++ b/neuron_sysfs_metrics.c @@ -389,7 +389,7 @@ static ssize_t nsysfsmetric_show_nrt_other_metrics(struct nsysfsmetric_metrics * char buffer[256]; int ret = ndhal->ndhal_tpb.pe_format_activity_stats(nd, attr->nc_id, buffer, sizeof(buffer)); if (ret) { - pr_err("sysfs failed to read pe_array activity counters, error = %d\n", ret); + pr_err_ratelimited("sysfs failed to read pe_array activity counters, error = %d\n", ret); } len = nsysfsmetric_sysfs_emit(buf, "%s", buffer); } else { @@ -942,7 +942,6 @@ int nsysfsmetric_register(struct neuron_device *nd, struct kobject *neuron_devic } // neuron{0, 1, ...}/stats/power - pr_info("Installing neuron power sysfs node\n"); struct nsysfsmetric_node *power_node = nsysfsmetric_init_and_add_one_node(metrics, stats_node, "power", false, -1, power_utilization_attrs_info_tbl_cnt, diff --git a/share/neuron_driver_shared.h b/share/neuron_driver_shared.h index b1f716c..030c19f 100644 --- a/share/neuron_driver_shared.h +++ b/share/neuron_driver_shared.h @@ -210,9 +210,25 @@ typedef struct neuron_memcpy_batch { void *context; // [in] TBD. opaque context pointer passed back in completion queue } neuron_memcpy_batch_t; +/* H2D Completion Queue Entry (CQE) */ +typedef struct neuron_h2d_dma_compl_queue_entry { + __u64 sequence_num; // Sequence number for the submitted IO request from runtime (0 means empty slot). + __s64 compl_ret; // Completion status for the request (0 success; negative errno on failure; positive to be used for future). + void *context; // Opaque context pointer copied from submission and represents a pointer to xu_error_list_t in runtime. +} neuron_h2d_dma_compl_queue_entry_t; + +/* H2D DMA Completion Queue (CQ) */ +typedef struct neuron_h2d_dma_compl_queue { + __u32 capacity; // Capacity of the completion queue (number of CQEs). + __u32 head; // Free-running index of the next CQE to be consumed by runtime. + __u32 tail; // Free-running index of the next free CQE to be written by driver. + // CQEs are laid out immediately after the header in the same mmap region. + neuron_h2d_dma_compl_queue_entry_t entries[]; // offset to the CQE array of the completion queue. +} neuron_h2d_dma_compl_queue_t; + /* * Memory allocation categories for sysfs counters -*/ + */ typedef enum { NEURON_MEMALLOC_TYPE_UNKNOWN_HOST, // only for old runtimes, do not use elsewhere NEURON_MEMALLOC_TYPE_CODE_HOST, @@ -244,16 +260,18 @@ typedef enum { /* * NDS stats * Note: - * To add a new counter type inside the enum, - * 1. you need to manually decrease NDS_ND_COUNTER_RESERVED or NDS_NC_COUNTER_RESERVED by 1 - * 2. you need to update NDS_ND_COUNTER_COUNT or NDS_NC_COUNTER_COUNT - * To prevent compatability issues, you need to always append the new counter type to the end of the enum + * To add a new counter type inside the enum, you need to manually + * decrease NDS_ND_COUNTER_RESERVED or NDS_EXT_NC_COUNTER_ADDED_RESERVED by 1. 
+ * + * To prevent compatibility issues, you need to always append the new counter type + * to the end of the enum, before NDS_ND_COUNTER_LAST or NDS_EXT_NC_COUNTER_LAST */ -#define NDS_ND_COUNTER_RESERVED 18 +#define NDS_ND_COUNTER_RESERVED 17 // Device counter types enum { - NDS_ND_COUNTER_RUNTIME_VERSION, + NDS_ND_COUNTER_START = 0, + NDS_ND_COUNTER_RUNTIME_VERSION = NDS_ND_COUNTER_START, NDS_ND_COUNTER_FRAMEWORK_VERSION, NDS_ND_COUNTER_FAL_VERSION, NDS_ND_COUNTER_FEATURE_BITMAP, @@ -270,8 +288,10 @@ enum { NDS_ND_COUNTER_DYNAMIC_SYSFS_METRIC_BITMAP, NDS_ND_COUNTER_DEVICE_CLUSTER_ID, + NDS_ND_COUNTER_AGG_NEFF_ID, + NDS_ND_COUNTER_LAST, - NDS_ND_COUNTER_COUNT = NDS_ND_COUNTER_DEVICE_CLUSTER_ID + NDS_ND_COUNTER_RESERVED + 1 + NDS_ND_COUNTER_COUNT = NDS_ND_COUNTER_LAST + NDS_ND_COUNTER_RESERVED }; #define NDS_NC_COUNTER_RESERVED 0 @@ -329,8 +349,9 @@ enum { NDS_NC_COUNTER_MAC_COUNT, NDS_NC_COUNTER_OOB, + NDS_NC_COUNTER_LAST, - NDS_NC_COUNTER_COUNT = NDS_NC_COUNTER_OOB + NDS_NC_COUNTER_RESERVED + 1 + NDS_NC_COUNTER_COUNT = NDS_NC_COUNTER_LAST + NDS_NC_COUNTER_RESERVED }; #define NDS_MAX_NEURONCORE_COUNT (4) diff --git a/udma/udma.h b/udma/udma.h index 484b82e..51400b7 100644 --- a/udma/udma.h +++ b/udma/udma.h @@ -212,6 +212,28 @@ enum { */ int udma_init(struct udma *udma, struct udma_params *udma_params); +/** + * udma_set_defaults() - set default configuration of one DMA engine + * + * @udma: udma structure to be initialized + * + * Return: 0 on success, a negative error code otherwise. + */ +int udma_set_defaults(struct udma *udma); + +/** + * udma_cache_defaults() - Cache frequently used CSR values. + * + * CSR reads are very slow and only one application (neuron) is using the DMA. + * So instead of reading CSRs, use the hardware reset value (from the datasheet) as + * the default value. + * + * @udma: udma structure + * + * Return: 0 on success, a negative error code otherwise. + */ +int udma_cache_defaults(struct udma *udma); + /** * udma_q_init() - Initialize the udma queue. * @@ -227,6 +249,15 @@ int udma_init(struct udma *udma, struct udma_params *udma_params); */ int udma_q_init(struct udma *udma, u32 qid, struct udma_q_params *q_params); +/** + * udma_q_enable() - Enables a udma queue + * + * @udma_q: udma queue data structure + * @enable: flag to enable/disable + * + */ +void udma_q_enable(struct udma_q *udma_q, int enable); + /** * udma_q_pause() - Pauses a udma queue * @@ -290,7 +321,7 @@ void udma_m2m_mask_ring_id_error(struct udma *udma, void __iomem *intc_base); * @udma: udma data structure * @state: new state to set * -* Return: 0 on success, a negative error code otherwise. + * Return: 0 on success, a negative error code otherwise. */ int udma_state_set(struct udma *udma, enum udma_state state); @@ -303,6 +334,17 @@ int udma_state_set(struct udma *udma, enum udma_state state); */ enum udma_state udma_state_get(struct udma *udma, enum udma_type type); + +/** + * udma_set_max_descs_and_prefetch() - set the maximum number of descriptors per DMA packet + * + * @udma: udma handle + * @max_descs: max desc per packet + * + * Return: 0 on success, a negative error code otherwise. + */ +int udma_set_max_descs_and_prefetch(struct udma *udma, u8 max_descs); + /** * udma_available_get() - Get number of descriptors that can be submitted to the udma. 
* diff --git a/udma/udma_m2m.c b/udma/udma_m2m.c index 37372d5..c79fd9a 100644 --- a/udma/udma_m2m.c +++ b/udma/udma_m2m.c @@ -9,6 +9,7 @@ #include "udma.h" #include "../neuron_arch.h" +#include "../neuron_dhal.h" /* Note on terminology: * for historical reasons the code uses both m2s/s2m and Tx/Rx terminology @@ -99,7 +100,7 @@ static void sdma_m2s_set_write_barrier(uint32_t *meta_ctrl) } /* set maximum number descriptors per one DMA packet */ -static int udma_set_max_descs_and_prefetch(struct udma *udma, u8 max_descs) +int udma_set_max_descs_and_prefetch(struct udma *udma, u8 max_descs) { // Due to DGE bug on V3 (https://tiny.amazon.com/tfw2hept) // Min burst must equal Max burst, which is 8 @@ -467,7 +468,7 @@ void udma_m2m_set_axi_error_abort(struct udma *udma) // step 2: program axi error control for (i = 0; i < 6; i++) { - for (q = 0; q < DMA_MAX_Q_MAX; q++) { + for (q = 0; q < ndhal->ndhal_udma.num_queues; q++) { reg_write32(&gen_regs->axi_error_control[i].table_addr, (q << 3) | 0x7); reg_write32(&gen_regs->axi_error_control[i].table_data, 0x10); } diff --git a/udma/udma_main.c b/udma/udma_main.c index 7390c6a..4147289 100644 --- a/udma/udma_main.c +++ b/udma/udma_main.c @@ -64,7 +64,7 @@ static int udma_m2s_packet_size_cfg_set(struct udma *udma, struct udma_m2s_pkt_l #define UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY (1 << 16) /* set default configuration of one DMA engine */ -static int udma_set_defaults(struct udma *udma) +int udma_set_defaults(struct udma *udma) { int ret = 0; struct udma_gen_ex_regs __iomem *gen_ex_regs; @@ -116,7 +116,7 @@ static int udma_set_defaults(struct udma *udma) /* Set addr_hi selectors */ gen_ex_regs = (struct udma_gen_ex_regs __iomem *)udma->gen_ex_regs; - for (i = 0; i < DMA_MAX_Q_V4; i++) + for (i = 0; i < ndhal->ndhal_udma.num_queues; i++) reg_write32(&gen_ex_regs->vmpr_v4[i].tx_sel, 0xffffffff); /* Set M2S data read master configuration */ @@ -128,7 +128,7 @@ static int udma_set_defaults(struct udma *udma) /* Set addr_hi selectors */ gen_ex_regs = (struct udma_gen_ex_regs __iomem *)udma->gen_ex_regs; - for (i = 0; i < DMA_MAX_Q_V4; i++) { + for (i = 0; i < ndhal->ndhal_udma.num_queues; i++) { reg_write32(&gen_ex_regs->vmpr_v4[i].rx_sel[0], 0xffffffff); reg_write32(&gen_ex_regs->vmpr_v4[i].rx_sel[1], 0xffffffff); reg_write32(&gen_ex_regs->vmpr_v4[i].rx_sel[2], 0xffffffff); @@ -181,10 +181,10 @@ static int udma_set_defaults(struct udma *udma) * So instead of reading CSR use hardware reset value(from datasheet) as * default value. 
*/ -static int udma_cache_defaults(struct udma *udma) +int udma_cache_defaults(struct udma *udma) { int i; - for (i = 0; i < DMA_MAX_Q_V4; i++) { + for (i = 0; i < ndhal->ndhal_udma.num_queues; i++) { struct udma_q *q = &udma->udma_q_m2s[i]; q->cfg = M2S_CFG_RESET_VALUE; q->rlimit_mask = M2S_RATE_LIMIT_RESET_VALUE; @@ -259,7 +259,7 @@ static int udma_q_set_pointers(struct udma_q *udma_q) /** enable/disable udma queue */ -static void udma_q_enable(struct udma_q *udma_q, int enable) +void udma_q_enable(struct udma_q *udma_q, int enable) { u32 reg; diff --git a/udma/udma_regs.h b/udma/udma_regs.h index f20ac3f..a24d589 100644 --- a/udma/udma_regs.h +++ b/udma/udma_regs.h @@ -92,7 +92,9 @@ struct udma_m2s_feature { struct udma_m2s_q { /* [0x0] M2S descriptor prefetch configuration */ u32 desc_pref_cfg; - u32 reserved0[7]; + /* [0x4] M2S descriptor prefetch configuration 2 */ + u32 desc_pref_cfg2; + u32 reserved0[6]; /* [0x20] M2S descriptor ring configuration */ u32 cfg; /* [0x24] M2S descriptor ring status and information */ @@ -345,7 +347,11 @@ struct udma_s2m_comp { }; struct udma_s2m_q { - u32 reserved0[8]; + /* [0x0] S2M descriptor prefetch configuration */ + u32 desc_pref_cfg; + /* [0x4] S2M descriptor prefetch configuration 2 */ + u32 desc_pref_cfg2; + u32 reserved0[6]; /* [0x20] S2M Descriptor ring configuration */ u32 cfg; /* [0x24] S2M Descriptor ring status and information */ diff --git a/v2/address_map.h b/v2/address_map.h index b0309a2..5607b43 100644 --- a/v2/address_map.h +++ b/v2/address_map.h @@ -17,7 +17,6 @@ #define V2_PCIE_ALL_RT_MASK 0x01f00000000000ull // relative to nc -#define V2_MMAP_P_OFFSET 0x00000000000000ull #define V2_MMAP_NC_EVENT_OFFSET 0x00000002700000ull #define V2_MMAP_NC_SEMA_READ_OFFSET V2_MMAP_NC_EVENT_OFFSET + 0x00000000001000ull #define V2_MMAP_NC_SEMA_SET_OFFSET V2_MMAP_NC_EVENT_OFFSET + 0x00000000001400ull @@ -35,8 +34,6 @@ // relative to V2 address space #define V2_APB_MISC_RAM_OFFSET 0x000ffff0fa0000ull -#define V2_MMAP_NC_SIZE 0x00000004000000ull - // Number of dice per chip #define V2_NUM_DIE_PER_DEVICE 1 diff --git a/v2/neuron_dhal_v2.c b/v2/neuron_dhal_v2.c index d196952..5fe4e61 100644 --- a/v2/neuron_dhal_v2.c +++ b/v2/neuron_dhal_v2.c @@ -26,7 +26,6 @@ extern int dev_nc_map; #define NR_RESET_RETRY_SLEEP_MS 100 #define V2_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS (1000 * 120) -#define V2_NR_RESET_POLL_INTERVAL 100 struct neuron_dm_special_mmap_ent dm_mmap_special_v2[] = { DM_SPECIAL_MM_ENT( NEURON_DM_BLOCK_TPB, 0, NEURON_DM_RESOURCE_SEMAPHORE, V2_MMAP_TPB_OFFSET, V2_PCIE_BAR0_TPB_0_OFFSET, V2_MMAP_TPB_SIZE, V2_MMAP_NC_EVENT_OFFSET, V2_MMAP_NC_SEMA_SIZE, 0), @@ -149,7 +148,7 @@ static int nr_initiate_reset_v2(struct neuron_device *nd, uint32_t nc_map) uint32_t tpb_reset_map = 0; nr_get_tpb_reset_map(nc_map, &tpb_reset_map); - int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map); + int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map, 0); if (ret) { return ret; } @@ -168,7 +167,7 @@ static int nr_initiate_reset_v2_qemu(struct neuron_device *nd, uint32_t nc_map) uint32_t tpb_reset_map = 0; nr_get_tpb_reset_map(nc_map, &tpb_reset_map); - int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map); + int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map, 0); if (ret) { return ret; } @@ -240,8 +239,9 @@ static int nr_wait_for_reset_completion_v2_emu(struct neuron_device *nd) * * @param nd - Neuron device which will be reset by the thread. 
*/ -static int nr_post_reset_config_v2(struct neuron_device *nd, bool reset_successful) +static int nr_post_reset_config_v2(struct neuron_device *nd, bool reset_successful, bool is_no_reset) { + nd->supports_hbm_7200 = 0; return 0; } @@ -367,29 +367,35 @@ static void ts_nq_destroy_one_v2(struct neuron_device *nd, u8 ts_id) /* Neuron Core Functions */ /** * nc_get_semaphore_base() - get semaphore base address - * + * * @param nd - neuron device * @param nc_id - neuron core index - * @return void* - semaphore base address + * @param sem_base - resulting semaphore base address + * + * Return: 0 on success, a negative error code otherwise. */ -static void *nc_get_semaphore_base_v2(struct neuron_device *nd, u8 nc_id) +static int nc_get_semaphore_base_v2(struct neuron_device *nd, u8 nc_id, void **sem_base) { - return nd->npdev.bar0 + V2_PCIE_BAR0_TPB_0_OFFSET + (V2_PCIE_BAR0_TPB_0_SIZE * nc_id); + (*sem_base) = nd->npdev.bar0 + V2_PCIE_BAR0_TPB_0_OFFSET + (V2_PCIE_BAR0_TPB_0_SIZE * nc_id); + return 0; } /** * nc_get_event_addr() - get event address - * + * * @param nd - neuron device * @param nc_id - neuron core index * @param event_index - event index - * @return void* - event address + * @param ev_addr - resulting event address + * + * Return: 0 on success, a negative error code otherwise. */ -static void *nc_get_event_addr_v2(struct neuron_device *nd, u8 nc_id, u16 event_index) +static int nc_get_event_addr_v2(struct neuron_device *nd, u8 nc_id, u16 event_index, void **ev_addr) { void *base = nd->npdev.bar0 + V2_PCIE_BAR0_TPB_0_OFFSET + (V2_PCIE_BAR0_TPB_0_SIZE * nc_id) + ndhal->ndhal_address_map.mmap_nc_event_offset; - return (base + (event_index * NC_EVENT_SIZE)); + (*ev_addr) = (base + (event_index * NC_EVENT_SIZE)); + return 0; } @@ -447,7 +453,7 @@ static void nnq_set_hwaddr_v2(struct neuron_device *nd, u8 nc_id, u8 index, u32 * @param device_dram_addr: DRAM Channel 0 and 1's addresses * @param device_dram_size: DRAM Channel 0 and 1's sizes */ -static void mpset_set_dram_and_mpset_info_v2(struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +static void mpset_set_dram_and_mpset_info_v2(struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) { mpset->num_channels = V2_MAX_DRAM_CHANNELS; mpset->mp_device_num_regions = 1; @@ -459,60 +465,6 @@ static void mpset_set_dram_and_mpset_info_v2(struct mempool_set *mpset, u64 *dev ndhal->ndhal_mpset.device_dram_end_addr[1] = device_dram_addr[1] + device_dram_size[1]; } -// Upper 16MB is used internally by the firmware, don't use it in the allocation pool -#define MEMPOOL_CARVEOUT_SIZE 0x1000000 // 16MB -/** - * mpset_block_carveout_regions() - * - in v2, block carve out regions: Upper 16 MB is used internally by firmware - * - * @param nd: neuron device - * @param mpset: pointer to mpset - * @param device_dram_addr: DRAM Channel 0's and 1's addresses - * @param device_dram_size: DRAM Channel 0's and 1's sizes - * @param region_sz: region size - * @return int: 0 on success, o/w on failure - */ -static int mpset_block_carveout_regions_v2(struct neuron_device *nd, struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) -{ - int ret; - u64 region_sz; - int channel = 0, region = 0; - - /* - * Block carve out regions: Upper 16 MB is used internally by firmware for trainuim - * - * Ideally we would carve out by simply changing the start address of the chunk; - * however, that breaks aligned allocation in 4.x kernel versions (fixed in 5.x). 
- * Fix here: - * commit 52fbf1134d479234d7e64ba9dcbaea23405f229e - * Author: Alexey Skidanov - * Date: Thu Jan 3 15:26:44 2019 -0800 - * - * lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk - */ - for (channel = 0; channel < mpset->num_channels; channel++) { - region_sz = device_dram_size[channel] / mpset->mp_device_num_regions; - for (region = 0; region < mpset->mp_device_num_regions; region++) { - const dma_addr_t start_addr = device_dram_addr[channel] + (region * region_sz); - struct mem_chunk *mc = NULL; - u32 nc_id = channel; - ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, MEMPOOL_CARVEOUT_SIZE, 0, MEM_LOC_DEVICE, channel, region, nc_id, NEURON_MEMALLOC_TYPE_NCDEV_DEVICE, &mc); - if (ret) { - pr_err("failed to allocate hbm carveout region: ret=%d\n", ret); - return -ENOMEM; - } - if (mc->pa != start_addr) { - pr_err("carve out mc not offset 0!"); - mc_free(&mc); - return -EINVAL; - } - } - ndhal->ndhal_mpset.device_dram_effective_base_addr[channel] = device_dram_addr[channel] + MEMPOOL_CARVEOUT_SIZE; - } - - return 0; -} - /* DMA Ring Functions */ /** @@ -619,26 +571,6 @@ static int ndmar_quiesce_queues_v2(struct neuron_device *nd, u32 nc_id, u32 engi return 0; } -/** ndmar_set_model_started() - * - * Checks to see if the pa belongs to PE IRAM FIFO offset. If so, then these - * descs are used to load the iram. The mem chunk is going to have all the descriptors - * to load the instructions in iram. So go through all the dma queues and check if this mem chunk is - * in that queue. Once we have the queue we set that queue to have descs - * for iram. The actual copy start of the queue would come when model is started and at that time - * set the state of model start for this nc. - * - * @nd: Neuron device which contains the DMA engine - * @pa: pa to check - * @mc: mem chunk that has descs - * - * Return: None - */ -static void ndmar_set_model_started_v2(struct neuron_device *nd, phys_addr_t pa, struct mem_chunk *mc) -{ - return; -} - /* FWIO Functions */ const int trn1_32xl_neigbor_ids[16][4] = { @@ -766,6 +698,11 @@ static int fw_io_read_csr_array_v2(void **ptrs, u32 *values, u32 num_csrs, bool if (num_csrs > FW_IO_MAX_READLESS_READ_REGISTER_COUNT) return -EINVAL; + // Force virtual platforms onto the direct path + if (narch_is_qemu() || narch_is_emu()) { + fw_io_read_csr_array_direct(ptrs, values, num_csrs, operational); + } + return fw_io_read_csr_array_direct(ptrs, values, num_csrs, operational); } @@ -803,37 +740,6 @@ static int fw_io_post_metric_v2(struct fw_io_ctx *ctx, u8 *data, u32 size) } -/* Register Access (read and write) Functions */ -/** - * reg_read32_array() - read an array of 32bit registers. - * - * @addr: register address. - * @value: read value would be stored here. - * @num_values: num values to read - * - * Return: 0 if read succeeds, a negative error code otherwise. 
- */ -inline int reg_read32_array_v2(void **addr, u32 *value, u32 num_values) -{ - int ret; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(addr, value, num_values, true); - if (ret != 0) { - pr_err("register read failure while reading %p\n", addr[0]); - dump_stack(); - } - return ret; -} - -inline int reg_read32_array_v2_qemu_emu(void **addr, u32 *value, u32 num_values) -{ - int i; - for (i = 0; i < num_values; i++) { - value[i] = readl(addr[i]); - } - return 0; -} - - /* Memory Map Functions */ /** * mmap_get_bar4_offset() - calculate the offset of BAR4 @@ -933,146 +839,6 @@ static int nsysfsmetric_add_tensor_engine_node_v2(struct nsysfsmetric_metrics *m /* PCI Functions */ -/** - * neuron_pci_release_bar() - Release a PCI BAR - * - * @param dev: PCI device whose resources were previously reserved by pci_request_region() - * @param bar: BAR to be reserved - * - * for V2, this function is dummy - */ -static int neuron_pci_release_bar_v2(struct pci_dev *dev, int bar) -{ - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - pci_release_region(dev, bar); - return 0; -} - -/** - * neuron_pci_reserve_bar() - Mark the PCI region associated with PCI BAR as being reserved - * - * @param dev: PCI device whose resources are to be reserved - * @param bar: BAR to be reserved - * @param res_name: Name to be associated with resource. - * @return int: Returns 0 on success, otherwise failure - */ -static int neuron_pci_reserve_bar_v2(struct pci_dev *dev, int bar, const char *res_name) -{ - int ret; - - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - ret = pci_request_region(dev, bar, res_name); - if (ret) { - pci_info(dev, "BAR %d: can't reserve %s\n", bar, res_name); - return -ENODEV; - } - - return 0; -} - - /** - * neuron_pci_set_npdev() - set BAR's physical addr, io addr, and size of neuron_pci_device - * - * @param dev: PCI device that owns the BAR - * @param bar: BAR number - * @param res_name: Name associated with resource - * @param bar_pa: start physical address of BAR - * @param bar_ioaddr: __iomem address to device BAR - * @param bar_size: size of BAR - * @return int: Returns 0 on success, otherwise failure - */ -static int neuron_pci_set_npdev_v2(struct pci_dev *dev, - int bar, - const char *res_name, - phys_addr_t *bar_pa, - void __iomem **bar_ioaddr, - u64 *bar_size) -{ - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - if (pci_resource_len(dev, bar) == 0) { - pci_info(dev, "BAR%d len is 0\n", bar); - return -ENODEV; - } - - *bar_pa = pci_resource_start(dev, bar); - if (!(*bar_pa)) { - pci_info(dev, "Can't get start address of BAR%d %s\n", bar, res_name); - return -ENODEV; - } - *bar_size = pci_resource_len(dev, bar); - - if (bar == ndhal->ndhal_pci.dram_bar) { - ndhal->ndhal_pci.dram_bar_size = *bar_size; - } - - if (bar == ndhal->ndhal_pci.dram_bar && wc_enable) - *bar_ioaddr = pci_iomap_wc(dev, bar, pci_resource_len(dev, bar)); - else - *bar_ioaddr = pci_iomap(dev, bar, pci_resource_len(dev, bar)); - - return 0; -} - -extern int dup_helper_enable; -static atomic_t dup_rid_cnt = 
ATOMIC_INIT(0); // count of duplicate routing IDs encountered -static int neuron_pci_handle_dup_routing_id(void) -{ - int ret = -ENODEV; - int dup_cnt; - char cmd[256]; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); -#else - dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; -#endif - - // If this is the first dup encounted, unload the driver - if ((dup_cnt == 0) && dup_helper_enable) { - pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); - - int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); - if (n > sizeof(cmd)) { - pr_err("unable to schedule driver unload cmd buffer len exceeded\n"); - return -EINVAL; - } - char *argv[] = { "/bin/sh", - "-c", - cmd, - NULL}; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL}; - - ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); - if (ret) - pr_err("unable to schedule driver unload. Error: %d\n", ret); - } - - return ret; -} // for V2 rename Neuron devices for better customer experience. // see internal documentation: TRN1-Discovery @@ -1202,31 +968,6 @@ static void ncdev_compatible_version_v2(struct neuron_ioctl_compatible_version * arg->max = V2_RT_MAX_COMPATIBLE_VERSION; } -/** - * ncdev_quiesce_exec_on_proc_exit() - * - * Note: - * When a process is killed, the driver resets DMA but there is no - * way to soft reset neuron cores. This causes problem if the - * process was executing serial TPB or switching activation tables, - * which result in abrubtly stopping DMA engines hence engines are - * are blocked on semaphores. This results in next model - * load failure or inference timeout. - * - * Proper way is clearing out semaphore, events after resetting - * DMA engines. However, it is a lot of code change, hence - * adding a sleep for 1 second when process exits, which allows - * the NeuronCore to continue to execute for a second. Since - * no new inference can be submitted during this time, NeuronCore - * state would be cleared out. 
- * - */ -static void ncdev_quiesce_exec_on_proc_exit_v2(void) -{ - // for V2, the 1 second DMA queisce delay in flush was eliminated to improve nrt_init performance - return; -} - static void ncdev_get_default_tpbs_for_hbm_v2(u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count) { tpbs[0] = hbm_index; @@ -1257,18 +998,6 @@ static void ndma_get_wait_for_completion_time_v2(u32 count, bool async, u64 *fir *following_wait_time *= 100; } -static void ndma_get_wait_for_completion_time_v2_qemu(u32 count, bool async, u64 *first_wait_time, u64 *following_wait_time) -{ - ndma_get_wait_for_completion_time_v2(count, async, first_wait_time, following_wait_time); - *following_wait_time *= 10 * 1000; -} - -static void ndma_get_wait_for_completion_time_v2_emu(u32 count, bool async, u64 *first_wait_time, u64 *following_wait_time) -{ - ndma_get_wait_for_completion_time_v2(count, async, first_wait_time, following_wait_time); - *following_wait_time *= 100 * 1000; -} - /** * ndma_validate_pa() - check the validity of the desc physical addresses * west side: PCIEX4_1_BASE: 0x00c00000000000 host: PCIEX8_0_BASE: 0x00400000000000 @@ -1537,6 +1266,27 @@ static int perf_set_profile_v2(struct neuron_device *nd, uint32_t profile) return 0; } +static int perf_get_profile_v2(struct neuron_device *nd, uint32_t *profile) +{ + // NOP implementation for v2 - return default profile value 0 + if (!profile) { + return -EINVAL; + } + *profile = 0; + return 0; +} + +static int perf_get_supported_profiles_v2(struct neuron_device *nd, u16 feature, u8 *num_profiles, u8 out_bitmap[32]) +{ + *num_profiles = 0; + return 0; +} + +static void perf_update_hbm_7200_supported_v2(struct neuron_device *nd) { + nd->supports_hbm_7200 = 0; + return; +} + /** * npe_class_node_id_show_data() - return sysfs class node_id * @@ -1633,7 +1383,6 @@ int ndhal_register_funcs_v2(void) { } ndhal->ndhal_address_map.pci_host_base = V2_PCIE_A0_BASE; - ndhal->ndhal_address_map.mmap_p_offset = V2_MMAP_P_OFFSET; ndhal->ndhal_address_map.mmap_nc_event_offset = V2_MMAP_NC_EVENT_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_read_offset = V2_MMAP_NC_SEMA_READ_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_set_offset = V2_MMAP_NC_SEMA_SET_OFFSET; @@ -1641,7 +1390,6 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.mmap_nc_sema_decr_offset = V2_MMAP_NC_SEMA_DECR_OFFSET; ndhal->ndhal_address_map.bar0_misc_ram_offset = V2_MMAP_BAR0_APB_MISC_RAM_OFFSET; ndhal->ndhal_address_map.port_1_base = 0ull; - ndhal->ndhal_address_map.mmap_nc_size = V2_MMAP_NC_SIZE; ndhal->ndhal_address_map.nc_per_device = V2_NC_PER_DEVICE; ndhal->ndhal_address_map.dev_nc_map = (1 << V2_NC_PER_DEVICE) - 1; ndhal->ndhal_address_map.dice_per_device = V2_NUM_DIE_PER_DEVICE; @@ -1650,7 +1398,6 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.ts_per_device = V2_TS_PER_DEVICE; ndhal->ndhal_address_map.dma_eng_per_nc = V2_DMA_ENG_PER_NC; ndhal->ndhal_address_map.dram_channels = V2_MAX_DRAM_CHANNELS; - ndhal->ndhal_reset.reset_poll_interval = V2_NR_RESET_POLL_INTERVAL; ndhal->ndhal_reset.initiate_max_wait_time = V2_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS; ndhal->ndhal_reset.retry_count = NR_RESET_RETRY_COUNT; ndhal->ndhal_reset.nr_post_reset_config = nr_post_reset_config_v2; @@ -1665,14 +1412,12 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_mpset.mp_min_alloc_size = (mempool_min_alloc_size < 1024) ? 
1024 : mempool_min_alloc_size; // v2 has a bigger mem size and gen pool create fails if < 1024 ndhal->ndhal_mpset.small_pool_supported = true; ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v2; - ndhal->ndhal_mpset.mpset_block_carveout_regions = mpset_block_carveout_regions_v2; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v2; ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v2; ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v2; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v2; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v2; ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v2; - ndhal->ndhal_ndmar.ndmar_set_model_started = ndmar_set_model_started_v2; ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v2; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v2; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v2; @@ -1687,17 +1432,14 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_sysfs_metrics.nsysfsmetric_add_tensor_engine_node = nsysfsmetric_add_tensor_engine_node_v2; ndhal->ndhal_pci.axi_bar = BAR_UNUSED; ndhal->ndhal_pci.dram_bar = 4; - ndhal->ndhal_pci.neuron_pci_release_bar = neuron_pci_release_bar_v2; - ndhal->ndhal_pci.neuron_pci_reserve_bar = neuron_pci_reserve_bar_v2; - ndhal->ndhal_pci.neuron_pci_set_npdev = neuron_pci_set_npdev_v2; ndhal->ndhal_pci.neuron_pci_get_device_id = neuron_pci_get_device_id_v2; ndhal->ndhal_pci.neuron_pci_device_id_to_rid_map = neuron_pci_device_id_to_rid_map_v2; ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v2; ndhal->ndhal_cdev.ncdev_bar0_write_blocked_addrs = ncdev_bar0_write_blocked_addrs_v2; ndhal->ndhal_cdev.ncdev_compatible_version = ncdev_compatible_version_v2; - ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit = ncdev_quiesce_exec_on_proc_exit_v2; ndhal->ndhal_cdev.ncdev_logical_to_physical_nc_map = NULL; ndhal->ndhal_cdev.ncdev_get_default_tpbs_for_hbm = ncdev_get_default_tpbs_for_hbm_v2; + ndhal->ndhal_udma.num_queues = DMA_MAX_Q_V4; ndhal->ndhal_udma.num_beats = 1024; // >= UDMA_REV_ID_4 ndhal->ndhal_ndma.ndma_retry_memcpy = true; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v2; @@ -1714,6 +1456,9 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_npe.npe_class_server_id_show_data = npe_class_server_id_show_data_v2; ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data = npe_class_ultraserver_mode_show_data_v2; ndhal->ndhal_perf.perf_set_profile = perf_set_profile_v2; + ndhal->ndhal_perf.perf_get_profile = perf_get_profile_v2; + ndhal->ndhal_perf.perf_get_supported_profiles = perf_get_supported_profiles_v2; + ndhal->ndhal_perf.perf_update_hbm_7200_supported = perf_update_hbm_7200_supported_v2; ndhal->ndhal_tpb.pe_xbus_count = 5; ndhal->ndhal_tpb.pe_row_grp_count = 4; ndhal->ndhal_tpb.pe_col_grp_count = 4; @@ -1730,9 +1475,7 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v2_qemu; ndhal->ndhal_address_map.seng_dma_eng_per_nd = V2_NC_PER_DEVICE * V2_DMA_ENG_PER_NC; ndhal->ndhal_address_map.h2d_dma_eng_per_nd = V2_NUM_H2D_DMA_PER_DEVICE; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v2_qemu_emu; ndhal->ndhal_pci.apb_bar = 2; - ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v2_qemu; } else if (narch_is_emu()) { ndhal->ndhal_reset.retry_count *= 1000; // wait longer on the emulator 
ndhal->ndhal_reset.nr_initiate_reset = nr_initiate_reset_v2_emu; @@ -1741,15 +1484,12 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.h2d_dma_eng_per_nd = nc_per_dev_param; ndhal->ndhal_address_map.nc_per_device = nc_per_dev_param; ndhal->ndhal_address_map.dev_nc_map = dev_nc_map; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v2_qemu_emu; ndhal->ndhal_pci.apb_bar = 0; - ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v2_emu; } else { ndhal->ndhal_reset.nr_initiate_reset = nr_initiate_reset_v2; ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v2; ndhal->ndhal_address_map.seng_dma_eng_per_nd = V2_NC_PER_DEVICE * V2_DMA_ENG_PER_NC; ndhal->ndhal_address_map.h2d_dma_eng_per_nd = V2_NUM_H2D_DMA_PER_DEVICE; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v2; ndhal->ndhal_pci.apb_bar = 0; } diff --git a/v2/notific.c b/v2/notific.c index 9cdd9a7..ed5ea6d 100644 --- a/v2/notific.c +++ b/v2/notific.c @@ -9,9 +9,6 @@ #include "notific.h" -#define NOTIFIC_NQ_SIZE 0x28 // total size of the NQ register space -#define NOTIFIC_NQ_HEAD_OFFSET 0x10c - static u64 seng_sdma_base[V2_MMAP_TPB_COUNT][V2_NUM_DMA_ENGINES_PER_TPB] = { { V2_APB_SENG_0_SDMA_0_BASE, V2_APB_SENG_0_SDMA_1_BASE, V2_APB_SENG_0_SDMA_2_BASE, V2_APB_SENG_0_SDMA_3_BASE, V2_APB_SENG_0_SDMA_4_BASE, V2_APB_SENG_0_SDMA_5_BASE, diff --git a/v2/notific.h b/v2/notific.h index fa62b45..f25ccf8 100644 --- a/v2/notific.h +++ b/v2/notific.h @@ -19,6 +19,7 @@ */ #include "address_map.h" +#include "../neuron_nq.h" #include "../neuron_reg_access.h" /** Returns NOTIFIC relative offset for given the DMA engine for given NC. @@ -49,43 +50,3 @@ static inline u64 notific_get_relative_offset_topsp_v2(int ts_idx) int notific_decode_nq_head_reg_access_v2(u64 offset, u8 *nc_id, u32 *nq_type, u8 *instance, bool *is_top_sp); - -#define NOTIFIC_NQ_SIZE 0x28 -#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START 0x100 -#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_NQ_BASE_ADDR_LO_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_base_addr_lo(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index); - - reg_write32(base + offset, value); -} - -#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START 0x104 -#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_NQ_BASE_ADDR_HI_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_base_addr_hi(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index); - - reg_write32(base + offset, value); -} - -#define NOTIFIC_NQ_F_SIZE_OFFSET_START 0x108 -#define NOTIFIC_NQ_F_SIZE_OFFSET(index) (NOTIFIC_NQ_F_SIZE_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_F_SIZE_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_f_size(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_F_SIZE_OFFSET(index); - - reg_write32(base + offset, value); -} diff --git a/v3/address_map.h b/v3/address_map.h index 32c751f..d1c9ac1 100644 --- a/v3/address_map.h +++ b/v3/address_map.h @@ -15,7 +15,6 @@ #define V3_PCIE_B0_3_BASE 0x1c000000000000ull // relative to nc -#define V3_MMAP_P_OFFSET 0x0000000d0000000ull #define V3_MMAP_NC_EVENT_OFFSET 0x00000002700000ull #define V3_MMAP_NC_SEMA_READ_OFFSET 
V3_MMAP_NC_EVENT_OFFSET + 0x00000000001000ull #define V3_MMAP_NC_SEMA_SET_OFFSET V3_MMAP_NC_EVENT_OFFSET + 0x00000000001400ull diff --git a/v3/neuron_dhal_v3.c b/v3/neuron_dhal_v3.c index 52e2d11..40e683e 100644 --- a/v3/neuron_dhal_v3.c +++ b/v3/neuron_dhal_v3.c @@ -25,10 +25,10 @@ #include "neuron_pelect.h" extern int dev_nc_map; +extern int reset_top_dma; #define NR_RESET_RETRY_SLEEP_MS 100 #define V3_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS (1000 * 480) -#define V3_NR_RESET_POLL_INTERVAL 100 int force_userver = 0; module_param(force_userver , int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); @@ -280,18 +280,29 @@ static enum neuron_platform_type ndhal_platform_type_v3(void) uint8_t cc_top_bv = (reset_unit_index_bv >> 24) & 0xFU; // Note: 4b here instead of 8b * */ -static void nr_get_tpb_reset_map(uint32_t nc_map, uint32_t *tpb_reset_map) +static void nr_get_tpb_reset_map(uint32_t nc_map, uint32_t *tpb_reset_map_lo, uint32_t *tpb_reset_map_hi) { int i; + uint32_t seng_mask; // Build the tpb reset map if we are not performing a device reset if (nc_map != NEURON_NC_MAP_DEVICE) { for (i = 0; i < MAX_NC_PER_DEVICE; i++) { if ((1 << i) & nc_map) { // Add this tpb to the reset map - *tpb_reset_map |= (1 << i); - *tpb_reset_map |= (1 << (i+8)); // SDMA group for this core - *tpb_reset_map |= (1 << (i+16)); // TOP SP group for this core + *tpb_reset_map_lo |= (1 << i); + *tpb_reset_map_lo |= (1 << (i+8)); // SDMA group for this core + *tpb_reset_map_lo |= (1 << (i+16)); // TOP SP group for this core + } + } + + // Reset top DMA only if both NCs in SENG are being reset + if (reset_top_dma) { + for (i = 0; i < V3_SENG_PER_DEVICE; i++) { + seng_mask = ((1 << V3_NC_PER_SENG) - 1) << (i * V3_NC_PER_SENG); + if ((nc_map & seng_mask) == seng_mask) { + *tpb_reset_map_hi |= (1 << i); + } } } } @@ -304,29 +315,32 @@ static void nr_get_tpb_reset_map(uint32_t nc_map, uint32_t *tpb_reset_map) */ static int nr_initiate_reset_v3(struct neuron_device *nd, uint32_t nc_map) { + uint32_t tpb_reset_map_lo = 0, tpb_reset_map_hi = 0; + int ret; + if (no_reset) return 0; - uint32_t tpb_reset_map = 0; - nr_get_tpb_reset_map(nc_map, &tpb_reset_map); + nr_get_tpb_reset_map(nc_map, &tpb_reset_map_lo, &tpb_reset_map_hi); - int ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map); - if (ret) { + ret = nr_initiate_reset_via_fw(nd, nc_map, tpb_reset_map_lo, tpb_reset_map_hi); + if (ret) return ret; - } return 0; } static int nr_initiate_reset_v3_qemu(struct neuron_device *nd, uint32_t nc_map) { + uint32_t tpb_reset_map_lo = 0, tpb_reset_map_hi = 0; + volatile void *addr; + if (no_reset) return 0; - uint32_t tpb_reset_map = 0; - nr_get_tpb_reset_map(nc_map, &tpb_reset_map); - volatile void *addr = nd->npdev.bar0 + V3_PCIE_BAR0_APB_IO_0_OFFSET + V3_APB_IO_0_USER_SE_0_RESERVED2_RELBASE + 0x10; - writel(tpb_reset_map, (volatile uint32_t *)addr); + nr_get_tpb_reset_map(nc_map, &tpb_reset_map_lo, &tpb_reset_map_hi); + addr = nd->npdev.bar0 + V3_PCIE_BAR0_APB_IO_0_OFFSET + V3_APB_IO_0_USER_SE_0_RESERVED2_RELBASE + 0x10; + writel(tpb_reset_map_lo, (volatile uint32_t *)addr); return 0; } @@ -396,8 +410,16 @@ static int nr_wait_for_reset_completion_v3_emu(struct neuron_device *nd) * @param nd - Neuron device which will be reset by the thread. 
* @param reset_successful - device reset was successful */ -static int nr_post_reset_config_v3(struct neuron_device *nd, bool reset_successful) +static int nr_post_reset_config_v3(struct neuron_device *nd, bool reset_successful, bool is_no_reset) { + if (reset_successful && !is_no_reset) { + if (nd->supports_hbm_7200 == -1) { + ndhal->ndhal_perf.perf_update_hbm_7200_supported(nd); + } + } else { + nd->supports_hbm_7200 = 0; + } + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return 0; } @@ -532,11 +554,14 @@ static void ts_nq_destroy_one_v3(struct neuron_device *nd, u8 ts_id) * * @param nd - neuron device * @param nc_id - neuron core index - * @return void* - semaphore base address + * @param sem_base - resulting semaphore base address + * + * Return: 0 on success, a negative error code otherwise. */ -static void *nc_get_semaphore_base_v3(struct neuron_device *nd, u8 nc_id) +static int nc_get_semaphore_base_v3(struct neuron_device *nd, u8 nc_id, void **sem_base) { - return nd->npdev.bar0 + V3_PCIE_BAR0_TPB_0_OFFSET + (V3_PCIE_BAR0_TPB_DIST * nc_id); + (*sem_base) = nd->npdev.bar0 + V3_PCIE_BAR0_TPB_0_OFFSET + (V3_PCIE_BAR0_TPB_DIST * nc_id); + return 0; } /** @@ -545,12 +570,15 @@ static void *nc_get_semaphore_base_v3(struct neuron_device *nd, u8 nc_id) * @param nd - neuron device * @param nc_id - neuron core index * @param event_index - event index - * @return void* - event address + * @param ev_addr - resulting event address + * + * Return: 0 on success, a negative error code otherwise. */ -static void *nc_get_event_addr_v3(struct neuron_device *nd, u8 nc_id, u16 event_index) +static int nc_get_event_addr_v3(struct neuron_device *nd, u8 nc_id, u16 event_index, void **ev_addr) { void * base = nd->npdev.bar0 + V3_PCIE_BAR0_TPB_0_OFFSET + (V3_PCIE_BAR0_TPB_DIST * nc_id) + ndhal->ndhal_address_map.mmap_nc_event_offset; - return (base + (event_index * NC_EVENT_SIZE)); + (*ev_addr) = (base + (event_index * NC_EVENT_SIZE)); + return 0; } @@ -608,7 +636,7 @@ static void nnq_set_hwaddr_v3(struct neuron_device *nd, u8 nc_id, u8 index, u32 * @param device_dram_addr: DRAM Channel 0 and 1's addresses * @param device_dram_size: DRAM Channel 0 and 1's sizes */ -static void mpset_set_dram_and_mpset_info_v3(struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +static void mpset_set_dram_and_mpset_info_v3(struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) { mpset->num_channels = V3_MAX_DRAM_CHANNELS; mpset->mp_device_num_regions = 1; @@ -648,60 +676,6 @@ static void mpset_set_dram_and_mpset_info_v3(struct mempool_set *mpset, u64 *dev } } -// Upper 16MB is used internally by the firmware, don't use it in the allocation pool -#define MEMPOOL_CARVEOUT_SIZE 0x1000000 // 16MB -/** - * mpset_block_carveout_regions() - * - in v3, block carve out regions: Upper 16 MB is used internally by firmware - * - * @param nd: neuron device - * @param mpset: pointer to mpset - * @param device_dram_addr: DRAM Channel 0's and 1's addresses - * @param device_dram_size: DRAM Channel 0's and 1's sizes - * @param region_sz: region size - * @return int: 0 on success, o/w on failure - */ -static int mpset_block_carveout_regions_v3(struct neuron_device *nd, struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) -{ - int ret; - u64 region_sz; - int channel = 0, region = 0; - - /* - * Block carve out regions: Upper 16 MB is used internally by firmware for trainuim2 - * - * Ideally we would carve out by simply changing the start address of the 
chunk; - * however, that breaks aligned allocation in 4.x kernel versions (fixed in 5.x). - * Fix here: - * commit 52fbf1134d479234d7e64ba9dcbaea23405f229e - * Author: Alexey Skidanov - * Date: Thu Jan 3 15:26:44 2019 -0800 - * - * lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk - */ - for (channel = 0; channel < mpset->num_channels; channel++) { - region_sz = device_dram_size[channel] / mpset->mp_device_num_regions; - for (region = 0; region < mpset->mp_device_num_regions; region++) { - const dma_addr_t start_addr = device_dram_addr[channel] + (region * region_sz); - struct mem_chunk *mc = NULL; - u32 nc_id = channel; - ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, MEMPOOL_CARVEOUT_SIZE, 0, MEM_LOC_DEVICE, channel, region, nc_id, NEURON_MEMALLOC_TYPE_NCDEV_DEVICE, &mc); - if (ret) { - pr_err("failed to allocate hbm carveout region: ret=%d\n", ret); - return -ENOMEM; - } - if (mc->pa != start_addr) { - pr_err("carve out mc not offset 0!"); - mc_free(&mc); - return -EINVAL; - } - } - ndhal->ndhal_mpset.device_dram_effective_base_addr[channel] = device_dram_addr[channel] + MEMPOOL_CARVEOUT_SIZE; - } - - return 0; -} - /* DMA Ring Functions */ /** @@ -833,26 +807,6 @@ static int ndmar_quiesce_queues_v3(struct neuron_device *nd, u32 nc_id, u32 engi return 0; } -/** ndmar_set_model_started() - * - * Checks to see if the pa belongs to PE IRAM FIFO offset. If so, then these - * descs are used to load the iram. The mem chunk is going to have all the descriptors - * to load the instructions in iram. So go through all the dma queues and check if this mem chunk is - * in that queue. Once we have the queue we set that queue to have descs - * for iram. The actual copy start of the queue would come when model is started and at that time - * set the state of model start for this nc. - * - * @nd: Neuron device which contains the DMA engine - * @pa: pa to check - * @mc: mem chunk that has descs - * - * Return: None - */ -static void ndmar_set_model_started_v3(struct neuron_device *nd, phys_addr_t pa, struct mem_chunk *mc) -{ - return; -} - /* FWIO Functions */ @@ -958,6 +912,11 @@ static int fw_io_read_csr_array_v3(void **ptrs, u32 *values, u32 num_csrs, bool if (num_csrs > FW_IO_MAX_READLESS_READ_REGISTER_COUNT) return -EINVAL; + // Force virtual platforms onto the direct path + if (narch_is_qemu() || narch_is_emu()) { + fw_io_read_csr_array_direct(ptrs, values, num_csrs, operational); + } + return fw_io_read_csr_array_direct(ptrs, values, num_csrs, operational); } @@ -995,37 +954,6 @@ static int fw_io_post_metric_v3(struct fw_io_ctx *ctx, u8 *data, u32 size) } -/* Register Access (read and write) Functions */ -/** - * reg_read32_array() - read an array of 32bit registers. - * - * @addr: register address. - * @value: read value would be stored here. - * @num_values: num values to read - * - * Return: 0 if read succeeds, a negative error code otherwise. 
- */ -inline int reg_read32_array_v3(void **addr, u32 *value, u32 num_values) -{ - int ret; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(addr, value, num_values, true); - if (ret != 0) { - pr_err("register read failure while reading %p\n", addr[0]); - dump_stack(); - } - return ret; -} - -inline int reg_read32_array_v3_qemu_emu(void **addr, u32 *value, u32 num_values) -{ - int i; - for (i = 0; i < num_values; i++) { - value[i] = readl(addr[i]); - } - return 0; -} - - /* Memory Map Functions */ /** * mmap_get_bar4_offset() - calculate the offset of BAR4 @@ -1167,158 +1095,6 @@ static int nsysfsmetric_add_tensor_engine_node_v3(struct nsysfsmetric_metrics *m /* PCI Functions */ -/** - * neuron_pci_release_bar() - Release a PCI BAR - * - * @param dev: PCI device whose resources were previously reserved by pci_request_region() - * @param bar: BAR to be reserved - * - * for V3, this function is dummy - */ -static int neuron_pci_release_bar_v3(struct pci_dev *dev, int bar) -{ - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - pci_release_region(dev, bar); - return 0; -} - -/** - * neuron_pci_reserve_bar() - Mark the PCI region associated with PCI BAR as being reserved - * - * @param dev: PCI device whose resources are to be reserved - * @param bar: BAR to be reserved - * @param res_name: Name to be associated with resource. - * @return int: Returns 0 on success, otherwise failure - */ -static int neuron_pci_reserve_bar_v3(struct pci_dev *dev, int bar, const char *res_name) -{ - int ret; - - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - goto err; - } - if (bar == BAR_UNUSED) { - return 0; - } - - ret = pci_request_region(dev, bar, res_name); - if (ret) { - pci_info(dev, "BAR %d: can't reserve %s\n", bar, res_name); - goto err; - } - - return 0; - -err: - //return -ENODEV; Until we can map BAR4 on cmdk - return (bar == 4)? 
0:-ENODEV; - -} - - /** - * neuron_pci_set_npdev() - set BAR's physical addr, io addr, and size of neuron_pci_device - * - * @param dev: PCI device that owns the BAR - * @param bar: BAR number - * @param res_name: Name associated with resource - * @param bar_pa: start physical address of BAR - * @param bar_ioaddr: __iomem address to device BAR - * @param bar_size: size of BAR - * @return int: Returns 0 on success, otherwise failure - */ -static int neuron_pci_set_npdev_v3(struct pci_dev *dev, - int bar, - const char *res_name, - phys_addr_t *bar_pa, - void __iomem **bar_ioaddr, - u64 *bar_size) -{ - if (bar != ndhal->ndhal_pci.apb_bar && bar != ndhal->ndhal_pci.axi_bar && bar != ndhal->ndhal_pci.dram_bar) { - pci_info(dev, "invalid BAR%d\n", bar); - return -ENODEV; - } - if (bar == BAR_UNUSED) { - return 0; - } - - if (pci_resource_len(dev, bar) == 0) { - pci_info(dev, "BAR%d len is 0\n", bar); - goto err; - } - - *bar_pa = pci_resource_start(dev, bar); - if (!(*bar_pa)) { - pci_info(dev, "Can't get start address of BAR%d %s\n", bar, res_name); - goto err; - } - *bar_size = pci_resource_len(dev, bar); - - if (bar == ndhal->ndhal_pci.dram_bar) { - ndhal->ndhal_pci.dram_bar_size = *bar_size; - } - - if (bar == ndhal->ndhal_pci.dram_bar && wc_enable) - *bar_ioaddr = pci_iomap_wc(dev, bar, pci_resource_len(dev, bar)); - else - *bar_ioaddr = pci_iomap(dev, bar, pci_resource_len(dev, bar)); - - return 0; - -err: - //return -ENODEV; Until we can map BAR4 on cmdk - *bar_pa = 0; - *bar_size = 0; - *bar_ioaddr = NULL; - return 0; -} - -extern int dup_helper_enable; -static atomic_t dup_rid_cnt = ATOMIC_INIT(0); // count of duplicate routing IDs encountered -static int neuron_pci_handle_dup_routing_id(void) -{ - int ret = -ENODEV; - int dup_cnt; - char cmd[256]; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); -#else - dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; -#endif - - // If this is the first dup encounted, unload the driver - if ((dup_cnt == 0) && dup_helper_enable) { - pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); - - int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); - if (n > sizeof(cmd)) { - pr_err("unable to schedule driver unload cmd buffer len exceeded\n"); - return -EINVAL; - } - char *argv[] = { "/bin/sh", - "-c", - cmd, - NULL}; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL}; - - ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); - if (ret) - pr_err("unable to schedule driver unload. Error: %d\n", ret); - } - - return ret; -} // for V3 rename Neuron devices for better customer experience. 
// see internal documentation: TRN2-Discovery @@ -1382,18 +1158,14 @@ static int neuron_pci_get_device_id_v3(struct neuron_device *nd, struct pci_dev } if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { - u32 server_info = 0; - bool server_id_valid = 0; - u32 server_id = 0; - ret = fw_io_server_info_read(nd->npdev.bar0, &server_info); + int server_id; + + ret = fw_io_server_info_read(nd->npdev.bar0, &server_id, NULL); if (ret) { return -ENODEV; } - server_id_valid = (server_info >> 15) & 0x1; // TODO PDS we probably need const shift value or macro - if (server_id_valid) { - server_id = server_info & 0x7fff; // TODO PDS we probably need constant mask for this - } else { + if (server_id == -1) { pr_err("Could not retrieve valid server id, ret = %d\n", ret); return -ENODEV; } @@ -1499,31 +1271,6 @@ static void ncdev_compatible_version_v3(struct neuron_ioctl_compatible_version * arg->max = V3_RT_MAX_COMPATIBLE_VERSION; } -/** - * ncdev_quiesce_exec_on_proc_exit() - * - * Note: - * When a process is killed, the driver resets DMA but there is no - * way to soft reset neuron cores. This causes problem if the - * process was executing serial TPB or switching activation tables, - * which result in abrubtly stopping DMA engines hence engines are - * are blocked on semaphores. This results in next model - * load failure or inference timeout. - * - * Proper way is clearing out semaphore, events after resetting - * DMA engines. However, it is a lot of code change, hence - * adding a sleep for 1 second when process exits, which allows - * the NeuronCore to continue to execute for a second. Since - * no new inference can be submitted during this time, NeuronCore - * state would be cleared out. - * - */ -static void ncdev_quiesce_exec_on_proc_exit_v3(void) -{ - // for V3, the 1 second DMA queisce delay in flush was eliminated to improve nrt_init performance - return; -} - static void ncdev_get_default_tpbs_for_hbm_v3(u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count) { tpbs[0] = hbm_index * 2; @@ -1555,18 +1302,6 @@ static void ndma_get_wait_for_completion_time_v3(u32 count, bool async, u64 *fir *following_wait_time *= 100; } -static void ndma_get_wait_for_completion_time_v3_qemu(u32 count, bool async, u64 *first_wait_time, u64 *following_wait_time) -{ - ndma_get_wait_for_completion_time_v3(count, async, first_wait_time, following_wait_time); - *following_wait_time *= 10 * 1000; -} - -static void ndma_get_wait_for_completion_time_v3_emu(u32 count, bool async, u64 *first_wait_time, u64 *following_wait_time) -{ - ndma_get_wait_for_completion_time_v3(count, async, first_wait_time, following_wait_time); - *following_wait_time *= 100 * 1000; -} - /** * ndma_validate_pa() - check the validity of the desc physical addresses * west side: PCIEX4_1_BASE: 0x00c00000000000 host: PCIEX8_0_BASE: 0x00400000000000 @@ -1962,11 +1697,58 @@ static int perf_set_profile_v3(struct neuron_device *nd, uint32_t profile) ret = fw_io_set_power_profile(nd->fw_io_ctx, profile); if (ret == 0) { ndhal->ndhal_perf.current_performance_profile = profile; + nd->current_perf_profile = profile; nmetric_set_performance_profile(nd, profile); + } else { + uint32_t cur_profile; + int retval = ndhal->ndhal_perf.perf_get_profile(nd, &cur_profile); + if (retval == 0) { + nd->current_perf_profile = cur_profile; + } else { + nd->current_perf_profile = 0; + } } return ret; } +static int perf_get_profile_v3(struct neuron_device *nd, uint32_t *profile) +{ + int ret; + if (!profile) { + return -EINVAL; + } + ret = 
fw_io_get_performance_profile(nd->fw_io_ctx, profile); + return ret; +} + +static int perf_get_supported_profiles_v3(struct neuron_device *nd, u16 feature, u8 *num_profiles, u8 out_bitmap[32]) +{ + return fw_io_get_available_profiles(nd->fw_io_ctx, feature, num_profiles, out_bitmap); +} + +static void perf_update_hbm_7200_supported_v3(struct neuron_device *nd) +{ + struct fw_io_get_available_profiles_response tmp; + int i; + int supports_hbm_7200 = 0; + int ret = fw_io_get_available_profiles(nd->fw_io_ctx, FW_IO_AVAILABLE_PERF_PROFILES_HBM_7200, &tmp.num_profiles, tmp.profiles_bitmap); + if (ret) { + nd->supports_hbm_7200 = 0; + return; + } + + for (i = 0; i < tmp.num_profiles; i++) { + int arr_idx = i / 8; + int bit_idx = i % 8; + if (tmp.profiles_bitmap[arr_idx] & (1 << bit_idx)) { + supports_hbm_7200 = 1; + break; + } + } + + nd->supports_hbm_7200 = supports_hbm_7200; +} + /** * npe_class_node_id_show_data() - return sysfs class node_id * @@ -1982,6 +1764,20 @@ static ssize_t npe_class_node_id_show_data_v3(char *buf, u32 sz) return npe_class_node_id_show_data(buf, sz); } +/** + * npe_class_node_cnt_show_data() - return sysfs class node_cnt + * + * @buf - sysfs buffer + * + */ +static ssize_t npe_class_node_cnt_show_data_v3(char *buf) +{ + if (ndhal->ndhal_arch.platform_type != NEURON_PLATFORM_TYPE_PDS) { + return dhal_sysfs_emit(buf, "-1\n"); + } + return npe_class_node_cnt_show_data(buf); +} + /** * npe_class_server_id_show_data() - return sysfs class node_id * @@ -2079,7 +1875,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_arch.platform_type = ndhal_platform_type_v3(); ndhal->ndhal_address_map.pci_host_base = V3_PCIE_A0_BASE; - ndhal->ndhal_address_map.mmap_p_offset = V3_MMAP_P_OFFSET; ndhal->ndhal_address_map.mmap_nc_event_offset = V3_MMAP_NC_EVENT_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_read_offset = V3_MMAP_NC_SEMA_READ_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_set_offset = V3_MMAP_NC_SEMA_SET_OFFSET; @@ -2087,7 +1882,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.mmap_nc_sema_decr_offset = V3_MMAP_NC_SEMA_DECR_OFFSET; ndhal->ndhal_address_map.bar0_misc_ram_offset = V3_MMAP_BAR0_APB_IO_0_MISC_RAM_OFFSET; ndhal->ndhal_address_map.port_1_base = 0ull; - ndhal->ndhal_address_map.mmap_nc_size = V3_MMAP_NC_SIZE; ndhal->ndhal_address_map.nc_per_device = V3_NC_PER_DEVICE; ndhal->ndhal_address_map.dev_nc_map = (1 << V3_NC_PER_DEVICE) - 1; ndhal->ndhal_address_map.dice_per_device = V3_NUM_DIE_PER_DEVICE; @@ -2096,7 +1890,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.ts_per_device = V3_TS_PER_DEVICE; ndhal->ndhal_address_map.dma_eng_per_nc = V3_DMA_ENG_PER_NC; ndhal->ndhal_address_map.dram_channels = V3_MAX_DRAM_CHANNELS; - ndhal->ndhal_reset.reset_poll_interval = V3_NR_RESET_POLL_INTERVAL; ndhal->ndhal_reset.initiate_max_wait_time = V3_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS; ndhal->ndhal_reset.retry_count = NR_RESET_RETRY_COUNT; ndhal->ndhal_reset.nr_post_reset_config = nr_post_reset_config_v3; @@ -2111,14 +1904,12 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_mpset.mp_min_alloc_size = (mempool_min_alloc_size < 1024) ? 
1024 : mempool_min_alloc_size; ndhal->ndhal_mpset.small_pool_supported = true; ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v3; - ndhal->ndhal_mpset.mpset_block_carveout_regions = mpset_block_carveout_regions_v3; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v3; ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v3; ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v3; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v3; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v3; ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v3; - ndhal->ndhal_ndmar.ndmar_set_model_started = ndmar_set_model_started_v3; ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v3; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v3; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v3; @@ -2134,17 +1925,14 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_pci.axi_bar = BAR_UNUSED; ndhal->ndhal_pci.apb_bar = 0; ndhal->ndhal_pci.dram_bar = 4; - ndhal->ndhal_pci.neuron_pci_release_bar = neuron_pci_release_bar_v3; - ndhal->ndhal_pci.neuron_pci_reserve_bar = neuron_pci_reserve_bar_v3; - ndhal->ndhal_pci.neuron_pci_set_npdev = neuron_pci_set_npdev_v3; ndhal->ndhal_pci.neuron_pci_get_device_id = neuron_pci_get_device_id_v3; ndhal->ndhal_pci.neuron_pci_device_id_to_rid_map = neuron_pci_device_id_to_rid_map_v3; ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v3; ndhal->ndhal_cdev.ncdev_bar0_write_blocked_addrs = ncdev_bar0_write_blocked_addrs_v3; ndhal->ndhal_cdev.ncdev_compatible_version = ncdev_compatible_version_v3; - ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit = ncdev_quiesce_exec_on_proc_exit_v3; ndhal->ndhal_cdev.ncdev_logical_to_physical_nc_map = ncdev_logical_to_physical_nc_map_v3; ndhal->ndhal_cdev.ncdev_get_default_tpbs_for_hbm = ncdev_get_default_tpbs_for_hbm_v3; + ndhal->ndhal_udma.num_queues = DMA_MAX_Q_V4; ndhal->ndhal_udma.num_beats = 2296; // allow up to 288 outstanding writes ndhal->ndhal_ndma.ndma_retry_memcpy = false; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3; @@ -2158,10 +1946,14 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_npe.npe_pod_status = npe_pod_status_v3; ndhal->ndhal_npe.npe_pod_ctrl = npe_pod_ctrl_v3; ndhal->ndhal_npe.npe_class_node_id_show_data = npe_class_node_id_show_data_v3; + ndhal->ndhal_npe.npe_class_node_cnt_show_data = npe_class_node_cnt_show_data_v3; ndhal->ndhal_npe.npe_class_server_id_show_data = npe_class_server_id_show_data_v3; ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data = npe_class_ultraserver_mode_show_data_v3; ndhal->ndhal_npe.npe_neighbor_eng_ids = npe_neighbor_eng_ids_v3; ndhal->ndhal_perf.perf_set_profile = perf_set_profile_v3; + ndhal->ndhal_perf.perf_get_profile = perf_get_profile_v3; + ndhal->ndhal_perf.perf_get_supported_profiles = perf_get_supported_profiles_v3; + ndhal->ndhal_perf.perf_update_hbm_7200_supported = perf_update_hbm_7200_supported_v3; ndhal->ndhal_tpb.pe_xbus_count = 9; ndhal->ndhal_tpb.pe_row_grp_count = 4; ndhal->ndhal_tpb.pe_col_grp_count = 4; @@ -2181,8 +1973,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v3_qemu; ndhal->ndhal_address_map.seng_dma_eng_per_nd = V3_NC_PER_DEVICE * V3_DMA_ENG_PER_NC; ndhal->ndhal_address_map.h2d_dma_eng_per_nd = V3_NUM_H2D_DMA_PER_DEVICE; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3_qemu_emu; - 
ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3_qemu; ndhal->ndhal_address_map.dice_per_device = 1; // Disable metrics on qemu @@ -2195,8 +1985,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.h2d_dma_eng_per_nd = nc_per_dev_param; ndhal->ndhal_address_map.nc_per_device = nc_per_dev_param; ndhal->ndhal_address_map.dev_nc_map = dev_nc_map; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3_qemu_emu; - ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3_emu; ndhal->ndhal_address_map.dice_per_device = 1; // Disable metrics on emulation @@ -2206,7 +1994,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v3; ndhal->ndhal_address_map.seng_dma_eng_per_nd = V3_NC_PER_DEVICE * V3_DMA_ENG_PER_NC; ndhal->ndhal_address_map.h2d_dma_eng_per_nd = V3_NUM_H2D_DMA_PER_DEVICE; - ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3; } if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { diff --git a/v3/neuron_pelect.c b/v3/neuron_pelect.c index c00a579..26249c9 100644 --- a/v3/neuron_pelect.c +++ b/v3/neuron_pelect.c @@ -157,13 +157,9 @@ #include "../neuron_crwl.h" #include "neuron_pelect.h" -int userver_pds_node_cnt = 2; -module_param(userver_pds_node_cnt, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); -MODULE_PARM_DESC(userver_pds_node_cnt, "pds ultraserver node count"); - -int userver_pds_server_id = 0x0001; -module_param(userver_pds_server_id, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); -MODULE_PARM_DESC(userver_pds_server_id, "pds ultraserver id"); +int pds_reservation_id = 0x0001; +module_param(pds_reservation_id, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(pds_reservation_id, "pds reservation id"); /* Enable ultraserver auto election (4 node configuration) by default */ @@ -293,7 +289,7 @@ typedef struct pod_neighbor_io { struct mem_chunk *data_mc; } pod_neighbor_io_t; -static void npe_pds_spoof(void); +static void npe_pds_config_init(void); static bool npe_pod_ctl_is_set(int value) { @@ -1216,10 +1212,10 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) goto done; } - // spoof PDS topology/election data + // initialize PDS configuration (topology/election) data // if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { - npe_pds_spoof(); + npe_pds_config_init(); goto done; } @@ -1272,6 +1268,11 @@ static int npe_get_modal_node_id(enum neuron_ultraserver_mode mode) { int node_id = ndhal_pelect_data.node_id; + // PDS doesn't change node_id based on mode because node ids are location based vs. 
election based + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + return node_id; + } + switch (mode) { case NEURON_ULTRASERVER_MODE_UNSET: break; @@ -1426,6 +1427,31 @@ static bool npe_mode_is_supported(enum neuron_ultraserver_mode mode) return false; } +static enum neuron_ultraserver_mode npe_node_cnt_to_mode(int node_cnt) +{ + enum neuron_ultraserver_mode mode = NEURON_ULTRASERVER_MODE_UNSET; + + switch (node_cnt) { + case 0: + case 1: + mode = NEURON_ULTRASERVER_MODE_X1; + break; + case 2: + if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2H)) { + mode = NEURON_ULTRASERVER_MODE_X2H; + } else if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2V)) { + mode = NEURON_ULTRASERVER_MODE_X2V; + } + break; + case 4: + mode = NEURON_ULTRASERVER_MODE_X4; + break; + default: + break; + } + return mode; +} + /** * npe_get_pod_id() * @@ -1727,22 +1753,18 @@ static void npe_stop_thread(void) ssize_t npe_class_node_id_show_data(char *buf, u32 sz) { int node_id; - enum neuron_ultraserver_mode mode = NEURON_ULTRASERVER_MODE_X1; + enum neuron_ultraserver_mode mode; if (npe_pod_state_busy()) { return dhal_sysfs_emit(buf, "busy\n"); } - if (sz == 4) { - mode = NEURON_ULTRASERVER_MODE_X4; - } else if (sz == 2) { - if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2H)) { - mode = NEURON_ULTRASERVER_MODE_X2H; - } else if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2V)) { - mode = NEURON_ULTRASERVER_MODE_X2V; - } + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + mode = npe_node_cnt_to_mode(ndhal_pelect_data.node_cnt); + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { + mode = npe_node_cnt_to_mode(sz); } else { - pr_err("Unexpected class entry: node_id_%d", sz); + pr_err("unexpected platform type %d", ndhal->ndhal_arch.platform_type); return dhal_sysfs_emit(buf, "invalid\n"); } @@ -1750,6 +1772,23 @@ ssize_t npe_class_node_id_show_data(char *buf, u32 sz) return dhal_sysfs_emit(buf, "%d\n", node_id); } +ssize_t npe_class_node_cnt_show_data(char *buf) +{ + int node_cnt = -1; // node_cnt is currently only returned for PDS + + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + node_cnt = ndhal_pelect_data.node_cnt; + } + + return dhal_sysfs_emit(buf, "%d\n", node_cnt); +} + +/** + * npe_class_server_id_show_data() + * + * return server id data for sysfs class node. PD and PDS + * have different server id data retrieval methodologies. 
+ */ ssize_t npe_class_server_id_show_data(char *buf, u32 sz) { u64 pod_serial_number; @@ -1759,18 +1798,12 @@ ssize_t npe_class_server_id_show_data(char *buf, u32 sz) return dhal_sysfs_emit(buf, "0000000000000000\n"); } - if (sz == 4) { - mode = NEURON_ULTRASERVER_MODE_X4; - } else if (sz == 2) { - if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2H)) { - mode = NEURON_ULTRASERVER_MODE_X2H; - } else if (npe_mode_is_supported(NEURON_ULTRASERVER_MODE_X2V)) { - mode = NEURON_ULTRASERVER_MODE_X2V; - } - } else { - pr_err("Unexpected class entry: server_id_%d", sz); - return dhal_sysfs_emit(buf, "invalid\n"); + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + mode = npe_node_cnt_to_mode(ndhal_pelect_data.node_cnt); + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { + mode = npe_node_cnt_to_mode(sz); } + pod_serial_number = npe_get_modal_serial_number(mode); return dhal_sysfs_emit(buf, "%016llx\n", pod_serial_number); @@ -1811,40 +1844,34 @@ struct { uint64_t d0_serial_number; // serial number of a particular device 0 on a particular server uint64_t server_num; // server unique id of the associated server uint32_t node_id; // (rack id<<1 | server id) + uint32_t node_cnt; // ultra-server nodes size. } npe_pds_tmp_mapping_tbl[] = { - {0x644b8499cd7bf298ull, 0x0000004005590701ull, 0}, - {0x001e8649a094af56ull, 0x0000004005590701ull, 1}, - {0x4b63b0678ae2a930ull, 0x0000004005590701ull, 2}, - {0x7242db0306415ed7ull, 0x0000004005590701ull, 3}, - {0x7e3a518befdf7a57ull, 0x0000004005590689ull, 0}, - {0x3c604484897a4f1aull, 0x0000004005590689ull, 1}, - {0xacfba8515bb626a6ull, 0x0000004005590689ull, 2}, - {0x48c2b73699e97cadull, 0x0000004005590689ull, 3}, - {0xa952ff53b45fc298ull, 0x0000004005590680ull, 0}, - {0x5961a8d75d827fc0ull, 0x0000004005590680ull, 1}, - {0x714cf1792facf83bull, 0x0000004005590680ull, 2}, - {0x9b3187e1756c8a7full, 0x0000004005590680ull, 3}, - {0x85059f2db248f3dfull, 0x0000004005590682ull, 0}, - {0xf4d2ef81ad1b1264ull, 0x0000004005590682ull, 1}, - {0x3d3ea5a61b768cbdull, 0x0000004005590682ull, 2}, - {0x85752a544054033aull, 0x0000004005590682ull, 3} + {0xf15812e4f3dc642cull, 0x0000004005590728ull, 0, 4}, //TRN3PDS US16 Node-1 Label swapped. + {0x51f84556b473ea1cull, 0x0000004005590728ull, 1, 4}, //TRN3PDS US16 Node-0 Label swapped. + {0xdd5da7090e13c984ull, 0x0000004005590728ull, 3, 4}, //TRN3PDS US16 Flipped the server_id(0) in rack1 + {0x2c82d5db4d1c3969ull, 0x0000004005590728ull, 2, 4}, //TRN3PDS US16 Flipped the server_id(1) in rack1 }; -/* npe_pds_spoof(void) +/* npe_pds_config_init(void) * - * temp spoof of PDS platform data + * Initialize pds configuration data. 
Configuration data consists of: + * - reservation_id - unique id identifying all the instances belonging to this PDS reservation + * - node_id - node id for this node in the PDS server + * - node_cnt - count of nodes in the PDS server * */ -static void npe_pds_spoof(void) +static void npe_pds_config_init(void) { static bool initialized = false; - int ret; + int ret = 0; int i; struct neuron_device *nd; uint64_t serial_number; + int instance_sz; + int partition_sz; + int server_id; + int rack_id; - pr_info("spoofing pds data"); - if (initialized) { return; } @@ -1857,42 +1884,87 @@ static void npe_pds_spoof(void) return; } + // TODO remove temp mapping table logic + // ret = fw_io_serial_number_read(nd->npdev.bar0, &serial_number); if (ret) { pr_err("nd%02d: local serial number read failed", nd->device_index); return; } + // check temporary mapping table for PDS server data + // for (i = 0; i < sizeof(npe_pds_tmp_mapping_tbl) / sizeof(*npe_pds_tmp_mapping_tbl); i++) { if (serial_number == npe_pds_tmp_mapping_tbl[i].d0_serial_number) { - ndhal_pelect_data.node_cnt = 4; ndhal_pelect_data.node_id = npe_pds_tmp_mapping_tbl[i].node_id; ndhal_pelect_data.pod_serial_num = npe_pds_tmp_mapping_tbl[i].server_num; + ndhal_pelect_data.node_cnt = npe_pds_tmp_mapping_tbl[i].node_cnt; goto done; } } - // otherwise, we use temporary parameter overrides + // get PDS platform data (instance and partition size) to determine node cnt // - ndhal_pelect_data.node_cnt = userver_pds_node_cnt; + ret = fw_io_instance_partition_sz_read(nd->npdev.bar0, &instance_sz, &partition_sz); + if (ret) { + goto done; + } - if (ndhal_pelect_data.node_cnt == 0) { - ndhal_pelect_data.node_id = -1; - } else if (ndhal_pelect_data.node_cnt == 2) { - // node_cnt of 2 uses V-links - ndhal_pelect_data.lr_mask = 0x1; - ndhal_pelect_data.node_id = ndhal->ndhal_arch.server_id; - } else if (ndhal_pelect_data.node_cnt == 4) { - // TODO PDS add in rack id - ndhal_pelect_data.node_id = ndhal->ndhal_arch.server_id; + if ((partition_sz == -1) || (instance_sz <= 0)) { + pr_warn("PDS partition/instance size data is invalid (%d/%d), defaulting to 4 node PDS configuration", partition_sz, instance_sz); + ndhal_pelect_data.node_cnt = 4; } else { - ndhal_pelect_data.node_cnt = 0; - pr_err("invalid PDS node count of %d", ndhal_pelect_data.node_cnt); + ndhal_pelect_data.node_cnt = partition_sz / instance_sz; + } + + if (ndhal_pelect_data.node_cnt == 2) { + // node_cnt of 2 uses V-links (for mode selection) + ndhal_pelect_data.lr_mask = 0x1; + } + + // get PDS reservation id + // + ret = fw_io_reservation_id_read(nd->npdev.bar0, &ndhal_pelect_data.pod_serial_num); + if (ret) { + goto done; + } + + if (ndhal_pelect_data.pod_serial_num == 0) { + pr_warn("PDS server reservation id invalid (%llu), defaulting to 'pds_reservation_id' parameter value: %u", ndhal_pelect_data.pod_serial_num, pds_reservation_id); + ndhal_pelect_data.pod_serial_num = pds_reservation_id; + } + + // get PDS server id and rack id + // + ret = fw_io_server_info_read(nd->npdev.bar0, &server_id, &rack_id); + if (ret || (server_id == -1) || (rack_id == -1)) { + pr_warn("Unable to retrieve PDS server server/rack ids, making best effort guess"); + server_id = (server_id == -1) ? 0 : server_id; + rack_id = (rack_id == -1) ? 0 : rack_id; } - ndhal_pelect_data.pod_serial_num = userver_pds_server_id; + // map server/rack to node id. 
TODO covert to mapping function + // + switch ((rack_id << 1) | server_id) { + case 0: + ndhal_pelect_data.node_id = 0; + break; + case 1: + ndhal_pelect_data.node_id = 1; + break; + case 2: + ndhal_pelect_data.node_id = 3; + break; + case 3: + ndhal_pelect_data.node_id = 2; + break; + default: + ndhal_pelect_data.node_id = -1; + break; + } done: + // TODO - correctly report topology discovery/election failure once interfaces become more mature ndhal_pelect_data.pod_state_internal = NEURON_NPE_POD_ST_ELECTION_SUCCESS; initialized = true; diff --git a/v3/neuron_pelect.h b/v3/neuron_pelect.h index e0c9136..2e9f4a2 100644 --- a/v3/neuron_pelect.h +++ b/v3/neuron_pelect.h @@ -92,6 +92,13 @@ int npe_pod_ctrl(struct neuron_device *nd, u32 ctrl, enum neuron_ultraserver_mod */ ssize_t npe_class_node_id_show_data(char *buf, u32 sz); +/** + * npe_class_node_cnt_show_data() - return sysfs class node_cnt + * + * @buf: sysfs buffer + */ +ssize_t npe_class_node_cnt_show_data(char *buf); + /** * npe_class_server_id_show_data() - return sysfs class server_id * diff --git a/v3/notific.c b/v3/notific.c index ad66411..3c1e82a 100644 --- a/v3/notific.c +++ b/v3/notific.c @@ -9,8 +9,6 @@ #include "notific.h" -#define NOTIFIC_NQ_HEAD_OFFSET 0x10c - static u64 get_sdma_misc_base(int nc_id, int eng_id) { int seng_id = nc_id / V3_NC_PER_SENG; diff --git a/v3/notific.h b/v3/notific.h index be32e0b..ca16d99 100644 --- a/v3/notific.h +++ b/v3/notific.h @@ -19,6 +19,7 @@ */ #include "address_map.h" +#include "../neuron_nq.h" #include "../neuron_reg_access.h" /** Returns NOTIFIC relative offset for given the DMA engine for given NC. @@ -62,48 +63,3 @@ static inline u64 notific_get_relative_offset_topsp_v3(int ts_idx) int notific_decode_nq_head_reg_access_v3(u64 offset, u8 *nc_id, u32 *nq_type, u8 *instance, bool *is_top_sp); - - -/* - * COMMON with V2 need to move to shared include at some point - * - */ -#define NOTIFIC_NQ_SIZE 0x28 // total size of the NQ register space -#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START 0x100 -#define NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_LO_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_NQ_BASE_ADDR_LO_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_base_addr_lo(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_BASE_ADDR_LO_OFFSET(index); - - reg_write32(base + offset, value); -} - -#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START 0x104 -#define NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index) (NOTIFIC_NQ_BASE_ADDR_HI_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_NQ_BASE_ADDR_HI_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_base_addr_hi(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_BASE_ADDR_HI_OFFSET(index); - - reg_write32(base + offset, value); -} - -#define NOTIFIC_NQ_F_SIZE_OFFSET_START 0x108 -#define NOTIFIC_NQ_F_SIZE_OFFSET(index) (NOTIFIC_NQ_F_SIZE_OFFSET_START + ((index)*NOTIFIC_NQ_SIZE) + 0) - -#define NOTIFIC_F_SIZE_RESET_VALUE 0x00000000 - -static inline void notific_write_nq_f_size(void __iomem *base, size_t index, - uint32_t value) -{ - const size_t offset = NOTIFIC_NQ_F_SIZE_OFFSET(index); - - reg_write32(base + offset, value); -} diff --git a/v4/address_map.h b/v4/address_map.h index 4391557..401925e 100644 --- a/v4/address_map.h +++ b/v4/address_map.h @@ -15,7 +15,6 @@ #define V4_PCIE_B0_3_BASE 0x1c000000000000ull // relative to nc -#define V4_MMAP_P_OFFSET 0x0000000d0000000ull #define V4_MMAP_NC_EVENT_OFFSET 
0x00000002700000ull #define V4_MMAP_NC_SEMA_READ_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001000ull #define V4_MMAP_NC_SEMA_SET_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001400ull diff --git a/v4/neuron_dhal_v4.c b/v4/neuron_dhal_v4.c index 9ed4d3c..72b87c2 100644 --- a/v4/neuron_dhal_v4.c +++ b/v4/neuron_dhal_v4.c @@ -198,7 +198,7 @@ static bool ndhal_instance_type_3xl(void) * @param device_dram_addr: DRAM Channel 0 and 1's addresses * @param device_dram_size: DRAM Channel 0 and 1's sizes */ -static void mpset_set_dram_and_mpset_info_v4(struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +static void mpset_set_dram_and_mpset_info_v4(struct neuron_mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) { mpset->num_channels = V4_MAX_DRAM_CHANNELS; mpset->mp_device_num_regions = 1; @@ -265,45 +265,8 @@ static int mmap_get_bar4_offset_v4(u64 start_addr, u64 size, u64 *offset) return 0; } -extern int dup_helper_enable; -static atomic_t dup_rid_cnt = ATOMIC_INIT(0); // count of duplicate routing IDs encountered -static int neuron_pci_handle_dup_routing_id(void) -{ - int ret = -ENODEV; - int dup_cnt; - char cmd[256]; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); -#else - dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; -#endif - - // If this is the first dup encounted, unload the driver - if ((dup_cnt == 0) && dup_helper_enable) { - pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); - - int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); - if (n > sizeof(cmd)) { - pr_err("unable to schedule driver unload cmd buffer len exceeded\n"); - return -EINVAL; - } - char *argv[] = { "/bin/sh", - "-c", - cmd, - NULL}; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL}; - - ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); - if (ret) - pr_err("unable to schedule driver unload. Error: %d\n", ret); - } - return ret; -} +/* PCI Functions */ // for V4 rename Neuron devices for better customer experience. 
// see internal documentation: TRN2-Discovery @@ -373,18 +336,14 @@ static int neuron_pci_get_device_id_v4(struct neuron_device *nd, struct pci_dev } if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { - u32 server_info = 0; - bool server_id_valid = 0; - u32 server_id = 0; - ret = fw_io_server_info_read(nd->npdev.bar0, &server_info); + int server_id; + + ret = fw_io_server_info_read(nd->npdev.bar0, &server_id, NULL); if (ret) { return -ENODEV; } - server_id_valid = (server_info >> 15) & 0x1; // TODO we probably need const shift value or macro - if (server_id_valid) { - server_id = server_info & 0x7fff; // TODO we probably need constant mask for this - } else { + if (server_id == -1) { pr_err("Could not retrieve valid server id, ret = %d\n", ret); return -ENODEV; } @@ -415,21 +374,21 @@ static int neuron_pci_get_device_id_v4(struct neuron_device *nd, struct pci_dev #define NC_MAPPING_MAX_CORE_COUNT_V4 128 static const struct neuron_ioctl_nc_map_entry nc_mapping_v0_seng_swap_pds[] = { { .device_id = 0, .device_nc_idx = 4 }, { .device_id = 0, .device_nc_idx = 5 }, { .device_id = 0, .device_nc_idx = 6 }, { .device_id = 0, .device_nc_idx = 7 }, { .device_id = 0, .device_nc_idx = 2 }, { .device_id = 0, .device_nc_idx = 3 }, { .device_id = 0, .device_nc_idx = 0 }, { .device_id = 0, .device_nc_idx = 1 }, // ND0 - { .device_id = 1, .device_nc_idx = 2 }, { .device_id = 1, .device_nc_idx = 3 }, { .device_id = 1, .device_nc_idx = 0 }, { .device_id = 1, .device_nc_idx = 1 }, { .device_id = 1, .device_nc_idx = 4 }, { .device_id = 1, .device_nc_idx = 5 }, { .device_id = 1, .device_nc_idx = 6 }, { .device_id = 1, .device_nc_idx = 7 }, // ND1 + { .device_id = 1, .device_nc_idx = 6 }, { .device_id = 1, .device_nc_idx = 7 }, { .device_id = 1, .device_nc_idx = 4 }, { .device_id = 1, .device_nc_idx = 5 }, { .device_id = 1, .device_nc_idx = 0 }, { .device_id = 1, .device_nc_idx = 1 }, { .device_id = 1, .device_nc_idx = 2 }, { .device_id = 1, .device_nc_idx = 3 }, // ND1 { .device_id = 2, .device_nc_idx = 4 }, { .device_id = 2, .device_nc_idx = 5 }, { .device_id = 2, .device_nc_idx = 6 }, { .device_id = 2, .device_nc_idx = 7 }, { .device_id = 2, .device_nc_idx = 2 }, { .device_id = 2, .device_nc_idx = 3 }, { .device_id = 2, .device_nc_idx = 0 }, { .device_id = 2, .device_nc_idx = 1 }, // ND2 - { .device_id = 3, .device_nc_idx = 2 }, { .device_id = 3, .device_nc_idx = 3 }, { .device_id = 3, .device_nc_idx = 0 }, { .device_id = 3, .device_nc_idx = 1 }, { .device_id = 3, .device_nc_idx = 4 }, { .device_id = 3, .device_nc_idx = 5 }, { .device_id = 3, .device_nc_idx = 6 }, { .device_id = 3, .device_nc_idx = 7 }, // ND3 + { .device_id = 3, .device_nc_idx = 6 }, { .device_id = 3, .device_nc_idx = 7 }, { .device_id = 3, .device_nc_idx = 4 }, { .device_id = 3, .device_nc_idx = 5 }, { .device_id = 3, .device_nc_idx = 0 }, { .device_id = 3, .device_nc_idx = 1 }, { .device_id = 3, .device_nc_idx = 2 }, { .device_id = 3, .device_nc_idx = 3 }, // ND3 { .device_id = 4, .device_nc_idx = 4 }, { .device_id = 4, .device_nc_idx = 5 }, { .device_id = 4, .device_nc_idx = 6 }, { .device_id = 4, .device_nc_idx = 7 }, { .device_id = 4, .device_nc_idx = 2 }, { .device_id = 4, .device_nc_idx = 3 }, { .device_id = 4, .device_nc_idx = 0 }, { .device_id = 4, .device_nc_idx = 1 }, // ND4 - { .device_id = 5, .device_nc_idx = 2 }, { .device_id = 5, .device_nc_idx = 3 }, { .device_id = 5, .device_nc_idx = 0 }, { .device_id = 5, .device_nc_idx = 1 }, { .device_id = 5, .device_nc_idx = 4 }, { .device_id = 5, .device_nc_idx = 5 }, { 
.device_id = 5, .device_nc_idx = 6 }, { .device_id = 5, .device_nc_idx = 7 }, // ND5 + { .device_id = 5, .device_nc_idx = 6 }, { .device_id = 5, .device_nc_idx = 7 }, { .device_id = 5, .device_nc_idx = 4 }, { .device_id = 5, .device_nc_idx = 5 }, { .device_id = 5, .device_nc_idx = 0 }, { .device_id = 5, .device_nc_idx = 1 }, { .device_id = 5, .device_nc_idx = 2 }, { .device_id = 5, .device_nc_idx = 3 }, // ND5 { .device_id = 6, .device_nc_idx = 4 }, { .device_id = 6, .device_nc_idx = 5 }, { .device_id = 6, .device_nc_idx = 6 }, { .device_id = 6, .device_nc_idx = 7 }, { .device_id = 6, .device_nc_idx = 2 }, { .device_id = 6, .device_nc_idx = 3 }, { .device_id = 6, .device_nc_idx = 0 }, { .device_id = 6, .device_nc_idx = 1 }, // ND6 - { .device_id = 7, .device_nc_idx = 2 }, { .device_id = 7, .device_nc_idx = 3 }, { .device_id = 7, .device_nc_idx = 0 }, { .device_id = 7, .device_nc_idx = 1 }, { .device_id = 7, .device_nc_idx = 4 }, { .device_id = 7, .device_nc_idx = 5 }, { .device_id = 7, .device_nc_idx = 6 }, { .device_id = 7, .device_nc_idx = 7 }, // ND7 + { .device_id = 7, .device_nc_idx = 6 }, { .device_id = 7, .device_nc_idx = 7 }, { .device_id = 7, .device_nc_idx = 4 }, { .device_id = 7, .device_nc_idx = 5 }, { .device_id = 7, .device_nc_idx = 0 }, { .device_id = 7, .device_nc_idx = 1 }, { .device_id = 7, .device_nc_idx = 2 }, { .device_id = 7, .device_nc_idx = 3 }, // ND7 { .device_id = 8, .device_nc_idx = 4 }, { .device_id = 8, .device_nc_idx = 5 }, { .device_id = 8, .device_nc_idx = 6 }, { .device_id = 8, .device_nc_idx = 7 }, { .device_id = 8, .device_nc_idx = 2 }, { .device_id = 8, .device_nc_idx = 3 }, { .device_id = 8, .device_nc_idx = 0 }, { .device_id = 8, .device_nc_idx = 1 }, // ND8 - { .device_id = 9, .device_nc_idx = 2 }, { .device_id = 9, .device_nc_idx = 3 }, { .device_id = 9, .device_nc_idx = 0 }, { .device_id = 9, .device_nc_idx = 1 }, { .device_id = 9, .device_nc_idx = 4 }, { .device_id = 9, .device_nc_idx = 5 }, { .device_id = 9, .device_nc_idx = 6 }, { .device_id = 9, .device_nc_idx = 7 }, // ND9 + { .device_id = 9, .device_nc_idx = 6 }, { .device_id = 9, .device_nc_idx = 7 }, { .device_id = 9, .device_nc_idx = 4 }, { .device_id = 9, .device_nc_idx = 5 }, { .device_id = 9, .device_nc_idx = 0 }, { .device_id = 9, .device_nc_idx = 1 }, { .device_id = 9, .device_nc_idx = 2 }, { .device_id = 9, .device_nc_idx = 3 }, // ND9 { .device_id = 10, .device_nc_idx = 4 }, { .device_id = 10, .device_nc_idx = 5 }, { .device_id = 10, .device_nc_idx = 6 }, { .device_id = 10, .device_nc_idx = 7 }, { .device_id = 10, .device_nc_idx = 2 }, { .device_id = 10, .device_nc_idx = 3 }, { .device_id = 10, .device_nc_idx = 0 }, { .device_id = 10, .device_nc_idx = 1 }, // ND10 - { .device_id = 11, .device_nc_idx = 2 }, { .device_id = 11, .device_nc_idx = 3 }, { .device_id = 11, .device_nc_idx = 0 }, { .device_id = 11, .device_nc_idx = 1 }, { .device_id = 11, .device_nc_idx = 4 }, { .device_id = 11, .device_nc_idx = 5 }, { .device_id = 11, .device_nc_idx = 6 }, { .device_id = 11, .device_nc_idx = 7 }, // ND11 + { .device_id = 11, .device_nc_idx = 6 }, { .device_id = 11, .device_nc_idx = 7 }, { .device_id = 11, .device_nc_idx = 4 }, { .device_id = 11, .device_nc_idx = 5 }, { .device_id = 11, .device_nc_idx = 0 }, { .device_id = 11, .device_nc_idx = 1 }, { .device_id = 11, .device_nc_idx = 2 }, { .device_id = 11, .device_nc_idx = 3 }, // ND11 { .device_id = 12, .device_nc_idx = 4 }, { .device_id = 12, .device_nc_idx = 5 }, { .device_id = 12, .device_nc_idx = 6 }, { .device_id = 12, .device_nc_idx = 
7 }, { .device_id = 12, .device_nc_idx = 2 }, { .device_id = 12, .device_nc_idx = 3 }, { .device_id = 12, .device_nc_idx = 0 }, { .device_id = 12, .device_nc_idx = 1 }, // ND12 - { .device_id = 13, .device_nc_idx = 2 }, { .device_id = 13, .device_nc_idx = 3 }, { .device_id = 13, .device_nc_idx = 0 }, { .device_id = 13, .device_nc_idx = 1 }, { .device_id = 13, .device_nc_idx = 4 }, { .device_id = 13, .device_nc_idx = 5 }, { .device_id = 13, .device_nc_idx = 6 }, { .device_id = 13, .device_nc_idx = 7 }, // ND13 + { .device_id = 13, .device_nc_idx = 6 }, { .device_id = 13, .device_nc_idx = 7 }, { .device_id = 13, .device_nc_idx = 4 }, { .device_id = 13, .device_nc_idx = 5 }, { .device_id = 13, .device_nc_idx = 0 }, { .device_id = 13, .device_nc_idx = 1 }, { .device_id = 13, .device_nc_idx = 2 }, { .device_id = 13, .device_nc_idx = 3 }, // ND13 { .device_id = 14, .device_nc_idx = 4 }, { .device_id = 14, .device_nc_idx = 5 }, { .device_id = 14, .device_nc_idx = 6 }, { .device_id = 14, .device_nc_idx = 7 }, { .device_id = 14, .device_nc_idx = 2 }, { .device_id = 14, .device_nc_idx = 3 }, { .device_id = 14, .device_nc_idx = 0 }, { .device_id = 14, .device_nc_idx = 1 }, // ND14 - { .device_id = 15, .device_nc_idx = 2 }, { .device_id = 15, .device_nc_idx = 3 }, { .device_id = 15, .device_nc_idx = 0 }, { .device_id = 15, .device_nc_idx = 1 }, { .device_id = 15, .device_nc_idx = 4 }, { .device_id = 15, .device_nc_idx = 5 }, { .device_id = 15, .device_nc_idx = 6 }, { .device_id = 15, .device_nc_idx = 7 }, // ND15 + { .device_id = 15, .device_nc_idx = 6 }, { .device_id = 15, .device_nc_idx = 7 }, { .device_id = 15, .device_nc_idx = 4 }, { .device_id = 15, .device_nc_idx = 5 }, { .device_id = 15, .device_nc_idx = 0 }, { .device_id = 15, .device_nc_idx = 1 }, { .device_id = 15, .device_nc_idx = 2 }, { .device_id = 15, .device_nc_idx = 3 }, // ND15 }; #define NC_MAPPING_V0_SENG_SWAP_SIZE (sizeof(nc_mapping_v0_seng_swap_pds) / sizeof(nc_mapping_v0_seng_swap_pds[0])) @@ -457,6 +416,11 @@ static int ncdev_logical_to_physical_nc_map_v4(struct neuron_ioctl_nc_map *map, return 0; } +static void perf_update_hbm_7200_supported_v4(struct neuron_device *nd) { + nd->supports_hbm_7200 = 0; + return; +} + /** * ndhal_register_funcs_v4() - initialize the dhal for v4 chips * @@ -478,6 +442,7 @@ int ndhal_register_funcs_v4(void) { ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v4; ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v4; ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v4; + ndhal->ndhal_perf.perf_update_hbm_7200_supported = perf_update_hbm_7200_supported_v4; if (narch_is_emu()) { // Temporarily disable resets on emulation until support is ready