diff --git a/finn_examples/bitfiles/xilinx_u55c_gen3x16_xdma_3_202210_1/yolov8n_4w4a_coco.xclbin b/finn_examples/bitfiles/xilinx_u55c_gen3x16_xdma_3_202210_1/yolov8n_4w4a_coco.xclbin
new file mode 100644
index 00000000..d0eee69c
Binary files /dev/null and b/finn_examples/bitfiles/xilinx_u55c_gen3x16_xdma_3_202210_1/yolov8n_4w4a_coco.xclbin differ
diff --git a/finn_examples/models.py b/finn_examples/models.py
index 0db17ecd..d2c00723 100644
--- a/finn_examples/models.py
+++ b/finn_examples/models.py
@@ -34,6 +34,7 @@
 from qonnx.core.datatype import DataType
 
 from finn_examples.driver import FINNExampleOverlay
+from finn_examples.yolov8.yolov8 import DetectorDriver
 
 _mnist_fc_io_shape_dict = {
     "idt": [DataType["UINT8"]],
@@ -99,6 +100,27 @@
     "num_outputs": 1,
 }
 
+_yolov8n_4w4a_coco_io_shape_dict = {
+    # FINN DataType for input and output tensors
+    "idt": [DataType["UINT8"]],
+    "odt": [DataType["INT21"], DataType["INT21"], DataType["INT21"]],
+    # shapes for input and output tensors (NHWC layout)
+    "ishape_normal": [(1, 192, 320, 3)],
+    "oshape_normal": [(1, 24, 40, 144), (1, 12, 20, 144), (1, 6, 10, 144)],
+    # folded / packed shapes below depend on idt/odt and input/output
+    # PE/SIMD parallelization settings -- these are calculated by the
+    # FINN compiler.
+    "ishape_folded": [(1, 192, 320, 3, 1)],
+    "oshape_folded": [(1, 24, 40, 144, 1), (1, 12, 20, 144, 1), (1, 6, 10, 144, 1)],
+    "ishape_packed": [(1, 192, 320, 3, 1)],
+    "oshape_packed": [(1, 24, 40, 144, 3), (1, 12, 20, 144, 3), (1, 6, 10, 144, 3)],
+    "input_dma_name": ["idma0"],
+    "output_dma_name": ["odma0", "odma1", "odma2"],
+    "number_of_external_weights": 0,
+    "num_inputs": 1,
+    "num_outputs": 3,
+}
+
 _imagenet_top5inds_io_shape_dict = {
     "idt": [DataType["UINT8"]],
     "odt": [DataType["UINT16"]],
@@ -333,6 +355,17 @@ def bincop_cnv(target_platform=None, bitfile_path=None):
     return FINNExampleOverlay(filename, driver_mode, _bincop_cnv_io_shape_dict)
 
 
+def yolov8n_4w4a_coco(target_platform=None, bitfile_path=None, batch_size=1):
+    target_platform = resolve_target_platform(target_platform)
+    driver_mode = get_driver_mode()
+    model_name = "yolov8n_4w4a_coco"
+    filename = find_bitfile(model_name, target_platform, bitfile_path)
+    quant_tail_params_dir = pk.resource_filename("finn_examples", "yolov8")
+    return DetectorDriver(
+        filename, driver_mode, _yolov8n_4w4a_coco_io_shape_dict, quant_tail_params_dir, batch_size
+    )
+
+
 def mobilenetv1_w4a4_imagenet(target_platform=None, bitfile_path=None, rt_weights_path=None):
     target_platform = resolve_target_platform(target_platform)
     driver_mode = get_driver_mode()
diff --git a/finn_examples/notebooks/8_yolov8_object_detection.ipynb b/finn_examples/notebooks/8_yolov8_object_detection.ipynb
new file mode 100644
index 00000000..11cc9866
--- /dev/null
+++ b/finn_examples/notebooks/8_yolov8_object_detection.ipynb
@@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cv2\n",
+    "from finn_examples import models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Initialize the Accelerator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The YOLOv8 accelerator is currently pre-built only for the Alveo U55C\n",
+    "accel = models.yolov8n_4w4a_coco()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Run Inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   
"source": [ + "img = cv2.imread('images/test.jpg')\n", + "detections = accel.single_inference(img, conf_thres=0.3) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualize output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "visualized_img = accel.visualize(img, detections)\n", + "cv2.imwrite('demo.jpg', visualized_img)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/finn_examples/yolov8/Add_0_param0 b/finn_examples/yolov8/Add_0_param0 new file mode 100644 index 00000000..ce2f5612 Binary files /dev/null and b/finn_examples/yolov8/Add_0_param0 differ diff --git a/finn_examples/yolov8/Add_1_param0 b/finn_examples/yolov8/Add_1_param0 new file mode 100644 index 00000000..2b7ac98b Binary files /dev/null and b/finn_examples/yolov8/Add_1_param0 differ diff --git a/finn_examples/yolov8/Add_2_param0 b/finn_examples/yolov8/Add_2_param0 new file mode 100644 index 00000000..5e7c2b63 Binary files /dev/null and b/finn_examples/yolov8/Add_2_param0 differ diff --git a/finn_examples/yolov8/Mul_0_param0 b/finn_examples/yolov8/Mul_0_param0 new file mode 100644 index 00000000..645a89c3 Binary files /dev/null and b/finn_examples/yolov8/Mul_0_param0 differ diff --git a/finn_examples/yolov8/Mul_1_param0 b/finn_examples/yolov8/Mul_1_param0 new file mode 100644 index 00000000..d5a2d33f Binary files /dev/null and b/finn_examples/yolov8/Mul_1_param0 differ diff --git a/finn_examples/yolov8/Mul_2_param0 b/finn_examples/yolov8/Mul_2_param0 new file mode 100644 index 00000000..84e6c74e Binary files /dev/null and b/finn_examples/yolov8/Mul_2_param0 differ diff --git a/finn_examples/yolov8/yolov8.py b/finn_examples/yolov8/yolov8.py new file mode 100644 index 00000000..8eb768a9 --- /dev/null +++ b/finn_examples/yolov8/yolov8.py @@ -0,0 +1,424 @@ +import cv2 +import numpy as np +import random +from os.path import join + +from finn_examples.driver import FINNExampleOverlay + +COCO_LABELS = [ + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +] + + +class DetectorDriver: + def __init__( + self, + bitfile_name, + platform, + io_shape_dict, + quant_tail_params_dir, + batch_size=1, + 
runtime_weight_dir="runtime_weights/", + device=None, + ): + self.accel = FINNExampleOverlay( + bitfile_name=bitfile_name, + platform=platform, + io_shape_dict=io_shape_dict, + batch_size=batch_size, + runtime_weight_dir=runtime_weight_dir, + device=device, + ) + self.io_shape_dict = io_shape_dict + self.batch_size = batch_size + self.stride = [8, 16, 32] + self.num_classes = 80 + self.dfl_regression_space = 16 + self.num_outputs = self.dfl_regression_space * 4 + self.num_classes + + self.muls = [ + np.load(join(quant_tail_params_dir, "Mul_{}_param0".format(i))) + for i in range(io_shape_dict["num_outputs"]) + ] + self.adds = [ + np.load(join(quant_tail_params_dir, "Add_{}_param0".format(i))) + for i in range(io_shape_dict["num_outputs"]) + ] + self.outputs = [ + [np.zeros(1) for _ in range(io_shape_dict["num_outputs"])] for b in range(batch_size) + ] + self.preproc_img_size = io_shape_dict["ishape_normal"][0][2] + + self.make_anchors() + + def single_inference(self, img, conf_thres=0.2): + batch = [img] + self.preproc_and_write_accel(batch) + self.execute_accel(asynch=True) + self.wait_until_accel_finished() + detections = self.read_accel_and_postprocess(conf_thres=conf_thres)[0] + return detections + + def visualize(self, img, detections): + img = img.copy() + detections[:, :4] = scale_coords( + self.io_shape_dict["ishape_normal"][0][1:3], detections[:, :4], img.shape[:2] + ) + for *xyxy, conf, cls in reversed(detections): + label = COCO_LABELS[int(cls)] + " {:.2f}".format(conf) + plot_one_box(xyxy, img, label=label, color=(0, 0, 255), line_thickness=1) + return img + + def execute_accel(self, asynch=False): + self.accel.execute_on_buffers(asynch=asynch) + + def wait_until_accel_finished(self): + self.accel.wait_until_finished() + for o in range(self.io_shape_dict["num_outputs"]): + self.accel.copy_output_data_from_device(self.accel.obuf_packed[o], ind=o) + + def make_anchors(self, grid_cell_offset=0.5): + """Generate anchors from features.""" + anchor_points, stride_tensor = [], [] + output_shapes = self.io_shape_dict["oshape_normal"] + for i, stride in enumerate(self.stride): + _, h, w, _ = output_shapes[i] + sx = np.arange(start=grid_cell_offset, stop=w, step=1) + sy = np.arange(start=grid_cell_offset, stop=h, step=1) + sx, sy = np.meshgrid(sx, sy) + anchor_points.append(np.stack((sx, sy), -1).reshape((-1, 2))) + stride_tensor.append([stride] * (h * w)) + + self.anchor_points = np.expand_dims(np.concatenate(anchor_points).transpose(1, 0), 0) + self.strides_tensor = np.concatenate(stride_tensor) + + def yolov8_postproc(self, outs, batch_size, anchor_points, strides, conf_thres=0.2): + dfl_integration_weights = np.arange(self.dfl_regression_space).reshape(1, -1, 1, 1) + x_cat = np.concatenate([out.reshape(batch_size, self.num_outputs, -1) for out in outs], 2) + boxes_classes = np.split(x_cat, [self.dfl_regression_space * 4], axis=1) + boxes, classes = boxes_classes + classes = 1 / (1 + np.exp(-classes)) + + # DFL expected value + boxes = boxes.reshape(batch_size, 4, self.dfl_regression_space, -1).transpose(0, 2, 1, 3) + exp_boxes = np.exp(boxes) + boxes = exp_boxes / np.sum(exp_boxes, axis=1) + boxes *= dfl_integration_weights + boxes = np.sum(boxes, 1) + + # decode bboxes + left_top_right_bottom = np.split(boxes, 2, axis=1) + lt = left_top_right_bottom[0] + rb = left_top_right_bottom[1] + x1y1 = anchor_points - lt + x2y2 = anchor_points + rb + # center_xy = (x1y1 + x2y2) / 2 + # wh = x2y2 - x1y1 + # boxes = np.concatenate((center_xy, wh), 1) * strides + boxes = np.concatenate((x1y1, 
x2y2), 1) * strides + pred = np.concatenate((boxes, classes), 1) + + # nms + pred = self.V8_non_max_suppression(pred, conf_thres=conf_thres) + + return pred + + def read_accel_and_postprocess(self, conf_thres=0.2): + for o in range(self.io_shape_dict["num_outputs"]): + # np.save(outputfile[o], obuf) + # self.accel.copy_output_data_from_device(self.accel.obuf_packed[o], ind=o) + obuf_folded = self.accel.unpack_output(self.accel.obuf_packed[o], ind=o) + obuf_normal = self.accel.unfold_output(obuf_folded, ind=o) + out = obuf_normal.transpose(0, 3, 1, 2) + out *= self.muls[o] + out += self.adds[o] + for in_batch_idx, single_output in enumerate(out): + self.outputs[in_batch_idx][o] = single_output + + batch_detections = [] + for outs_idx, outs in enumerate(self.outputs): + batch_detections.append( + self.yolov8_postproc( + outs, 1, self.anchor_points, self.strides_tensor, conf_thres=conf_thres + )[0] + ) + + return batch_detections + + def letterbox( + self, + img, + new_shape=(640, 640), + color=(114, 114, 114), + auto=False, + scaleFill=False, + scaleup=True, + ): + # Resize and pad image while meeting stride-multiple constraints + stride = max(self.stride) + shape = img.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better test mAP) + r = min(r, 1.0) + + # Compute padding + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder( + img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color + ) # add border + img = img[:, :, ::-1] # BGR to RGB, to 3x416x416 + + return img + + def preproc_and_write_accel(self, batch): + preprocessed_batch = [] + for i in range(self.batch_size): + img = self.letterbox(batch[i], self.io_shape_dict["ishape_normal"][0][1:3]) + preprocessed_batch.append(np.expand_dims(img, axis=0)) + ibuf_normal = np.concatenate(preprocessed_batch, axis=0) + + ibuf_folded = self.accel.fold_input(ibuf_normal) + ibuf_packed = self.accel.pack_input(ibuf_folded) + self.accel.copy_input_data_to_device(ibuf_packed) + + def V8_non_max_suppression( + self, + prediction, + conf_thres=0.2, + iou_thres=0.45, + classes=None, + agnostic=False, + multi_label=True, + labels=(), + max_det=300, + nc=0, # number of classes (optional) + max_time_img=0.05, + max_nms=30000, + max_wh=7680, + in_place=True, + ): + def nms(boxes, scores, overlap_threshold=0.5, min_mode=False): + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + index_array = scores.argsort()[::-1] + keep = [] + while index_array.size > 0: + keep.append(index_array[0]) + x1_ = np.maximum(x1[index_array[0]], x1[index_array[1:]]) + y1_ = np.maximum(y1[index_array[0]], y1[index_array[1:]]) + x2_ = np.minimum(x2[index_array[0]], x2[index_array[1:]]) + y2_ = np.minimum(y2[index_array[0]], 
y2[index_array[1:]])
+
+                w = np.maximum(0.0, x2_ - x1_ + 1)
+                h = np.maximum(0.0, y2_ - y1_ + 1)
+                inter = w * h
+
+                if min_mode:
+                    overlap = inter / np.minimum(areas[index_array[0]], areas[index_array[1:]])
+                else:
+                    overlap = inter / (areas[index_array[0]] + areas[index_array[1:]] - inter)
+
+                inds = np.where(overlap <= overlap_threshold)[0]
+                index_array = index_array[inds + 1]
+            return keep
+
+        bs = prediction.shape[0]  # batch size (BCN, i.e. 1,84,6300)
+        nc = nc or (prediction.shape[1] - 4)  # number of classes
+        xc = np.max(prediction[:, 4:], axis=1) > conf_thres
+
+        prediction = prediction.transpose(0, 2, 1)
+        output = [np.zeros((0, 6))] * bs
+        for xi, x in enumerate(prediction):  # image index, image inference
+            x = x[xc[xi]]  # confidence
+
+            # If none remain, process next image
+            if not x.shape[0]:
+                continue
+
+            # Detections matrix nx6 (xyxy, conf, cls)
+            box_cls = np.split(x, [4], axis=1)
+            box = box_cls[0]
+            cls = box_cls[1]
+
+            j = cls.argmax(1, keepdims=True)
+            conf = np.take_along_axis(x[:, 4:], j, axis=1)
+            x = np.concatenate((box, conf, j), 1)
+
+            # Filter by class
+            if classes is not None:
+                x = x[(x[:, 5:6] == classes).any(1)]
+
+            # Check shape
+            n = x.shape[0]  # number of boxes
+            if not n:  # no boxes
+                continue
+            if n > max_nms:  # excess boxes
+                # sort by confidence (np.argsort is ascending, so reverse) and remove excess boxes
+                x = x[x[:, 4].argsort()[::-1][:max_nms]]
+
+            # Batched NMS
+            c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
+            scores = x[:, 4]  # scores
+            boxes = x[:, :4] + c  # boxes (offset by class)
+            i = nms(boxes, scores, iou_thres)
+            i = i[:max_det]  # limit detections
+
+            output[xi] = x[i]
+
+        return output
+
+
+def plot_one_box(x, img, color=None, label=None, line_thickness=3):
+    # Plots one bounding box on image img
+    tl = (
+        line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
+    )  # line/font thickness
+    color = color or [random.randint(0, 255) for _ in range(3)]
+    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
+    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
+    if label:
+        tf = max(tl - 1, 1)  # font thickness
+        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
+        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
+        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
+        cv2.putText(
+            img,
+            label,
+            (c1[0], c1[1] - 2),
+            0,
+            tl / 3,
+            [225, 255, 255],
+            thickness=tf,
+            lineType=cv2.LINE_AA,
+        )
+
+
+def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
+    # Rescale coords (xyxy) from img1_shape to img0_shape
+    if ratio_pad is None:  # calculate from img0_shape
+        gain = min(
+            img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]
+        )  # gain = old / new
+        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (
+            img1_shape[0] - img0_shape[0] * gain
+        ) / 2  # wh padding
+    else:
+        gain = ratio_pad[0][0]
+        pad = ratio_pad[1]
+
+    coords[:, [0, 2]] -= pad[0]  # x padding
+    coords[:, [1, 3]] -= pad[1]  # y padding
+    coords[:, :4] /= gain
+    clip_coords(coords, img0_shape)
+    return coords
+
+
+def clip_coords(boxes, img_shape):
+    # Clip xyxy bounding boxes to image shape (height, width)
+    boxes[:, 0] = np.clip(boxes[:, 0], 0, img_shape[1])  # x1
+    boxes[:, 1] = np.clip(boxes[:, 1], 0, img_shape[0])  # y1
+    boxes[:, 2] = np.clip(boxes[:, 2], 0, img_shape[1])  # x2
+    boxes[:, 3] = np.clip(boxes[:, 3], 0, img_shape[0])  # y2
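
For reference, a minimal sketch of the batched inference path exposed by DetectorDriver above (the notebook only exercises the single-image helper). The frame filenames, batch size, and confidence threshold are illustrative placeholders; the call sequence itself follows the driver methods added in this diff and assumes the U55C bitfile can be located by find_bitfile.

import cv2

from finn_examples import models

# Build the driver for a batch of four frames; batch_size must match the
# number of images handed to preproc_and_write_accel() below.
accel = models.yolov8n_4w4a_coco(batch_size=4)

# Placeholder input frames -- substitute your own images.
frames = [cv2.imread("frame_%d.jpg" % i) for i in range(4)]

# Letterbox and pack the batch into the accelerator's input buffer, start
# execution asynchronously, then block until all three detection heads
# have been copied back from the device.
accel.preproc_and_write_accel(frames)
accel.execute_accel(asynch=True)
accel.wait_until_accel_finished()

# Dequantize, decode (DFL + anchors) and run NMS on the host; returns one
# (N, 6) array of [x1, y1, x2, y2, conf, cls] rows per frame.
batch_detections = accel.read_accel_and_postprocess(conf_thres=0.3)

annotated = [accel.visualize(f, d) for f, d in zip(frames, batch_detections)]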
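
The box decoding in yolov8_postproc() can be hard to follow from the vectorized code: each head emits 144 channels per cell, i.e. 4 x 16 DFL regression bins plus 80 class logits, and each box side is decoded as the softmax expectation over its 16 bins before being turned into pixel coordinates around the cell's anchor point. The standalone NumPy sketch below reproduces that decode step for a single toy cell; the logits, anchor point and stride are made-up values for illustration only.

import numpy as np

# Toy illustration of the DFL box decoding used in yolov8_postproc():
# each of the 4 box sides is predicted as a 16-bin distribution, and the
# decoded distance is the softmax expectation over bin indices 0..15.
reg_space = 16
rng = np.random.default_rng(0)

# One cell, 4 sides, 16 bins each (made-up logits).
side_logits = rng.normal(size=(4, reg_space))

# Softmax over the bin axis, then expectation against the bin indices.
exp_logits = np.exp(side_logits - side_logits.max(axis=1, keepdims=True))
probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)
lt_rb = probs @ np.arange(reg_space)  # (4,) distances: left, top, right, bottom

# Decode to a corner-format box around this cell's anchor point, then scale
# from feature-map units to input-image pixels by the head's stride.
anchor, stride = np.array([10.5, 7.5]), 8.0  # e.g. a cell on the stride-8 head
x1y1 = (anchor - lt_rb[:2]) * stride
x2y2 = (anchor + lt_rb[2:]) * stride
print(np.concatenate([x1y1, x2y2]))  # [x1, y1, x2, y2] in pixels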