-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathbenchmark.py
More file actions
103 lines (86 loc) · 4.4 KB
/
benchmark.py
File metadata and controls
103 lines (86 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# pylint: disable=line-too-long, invalid-name, too-many-locals, c-extension-no-member, redefined-outer-name
# built-in library
import sys
import os
import time
# third party library
import cv2
import cupy as cp
import numpy as np
import pandas as pd
from resize import cuda_resize
def main(input_array: "cp.ndarray | np.ndarray", resize_shape: tuple) -> tuple:
    """Copy an image batch into a fresh GPU buffer and resize it with ``cuda_resize``.

    Args:
        input_array: Image batch to resize. A CuPy array is copied
            device-to-device, a NumPy array host-to-device.
        resize_shape: Target size, forwarded unchanged to ``cuda_resize``
            (presumably ``(height, width)`` -- confirm against ``cuda_resize``).

    Returns:
        ``(output_array, [resize_scale, top_pad, left_pad])``, i.e. the four
        values returned by ``cuda_resize(..., pad=False)`` regrouped.

    Raises:
        TypeError: If ``input_array`` is neither a CuPy nor a NumPy array.
            Previously such inputs fell through both branches and the
            uninitialised GPU buffer was resized.
    """
    # cudaMemcpyKind: 0 HtoH, 1 HtoD, 2 DtoH, 3 DtoD, 4 unified virtual addressing.
    if isinstance(input_array, cp.ndarray):
        # The raw memcpy below copies `nbytes` starting at the base pointer,
        # which is only valid for a C-contiguous layout.
        if not input_array.flags.c_contiguous:
            input_array = cp.ascontiguousarray(input_array)
        src_ptr = int(input_array.data)
        copy_kind = 3  # DtoD
    elif isinstance(input_array, np.ndarray):
        if not input_array.flags.c_contiguous:
            input_array = np.ascontiguousarray(input_array)
        src_ptr = input_array.ctypes.data
        copy_kind = 1  # HtoD
    else:
        raise TypeError(
            "input_array must be a cupy.ndarray or numpy.ndarray, "
            f"got {type(input_array).__name__}"
        )

    input_array_gpu = cp.empty(shape=input_array.shape, dtype=input_array.dtype)
    cp.cuda.runtime.memcpy(dst=int(input_array_gpu.data),  # dst_ptr
                           src=src_ptr,                    # src_ptr
                           size=input_array.nbytes,
                           kind=copy_kind)

    resize_scale, top_pad, left_pad, output_array = cuda_resize(input_array_gpu,
                                                                resize_shape,
                                                                pad=False)
    return output_array, [resize_scale, top_pad, left_pad]
def warm_up(shape):
    """Run one throw-away ``cuda_resize`` call and print the output shape.

    Args:
        shape: ``(width, height)`` of the dummy frames to generate.

    A batch of 200 all-ones ``uint8`` frames with array shape
    ``(200, height, width, 3)`` is resized to ``(128, 256)`` with padding
    disabled. Only the shape of the result is reported.
    """
    width, height = shape
    dummy_batch = cp.ones((200, height, width, 3), dtype=np.uint8)
    result = cuda_resize(dummy_batch, (128, 256), pad=False)
    # cuda_resize returns (resize_scale, top_pad, left_pad, output_array);
    # only the resized array matters here.
    _, _, _, warmed_output = result
    print("Warm up:", warmed_output.shape)
if __name__ == "__main__":
    # Benchmark cuda_resize for every (source size -> destination size) pair
    # and print a table of the mean GPU time per image in microseconds.
    batch = 100  # number of images handed to cuda_resize per call
    # (width, height) pairs, the order cv2.resize expects.
    size = [(3840, 2160), (1920, 1080), (960, 540), (480, 270),
            (240, 135), (120, 67), (60, 33), (30, 16)]
    warm_up(size[0])

    labels = [str(size_) for size_ in size]
    # Columns are source sizes, rows are destination sizes.
    benchmark = pd.DataFrame(columns=labels, index=labels)

    for src_shape in size:
        # Source images are cached on disk per source size so that later runs
        # skip the decode + CPU resize step.
        cache_path = f"{src_shape}.npy"
        cached = os.path.exists(cache_path)
        if cached:
            imgs = np.load(cache_path)
        else:
            imgs = np.asarray([cv2.resize(cv2.imread(f"val2017/{img_name}"), src_shape)
                               for img_name in os.listdir("val2017")[:1000]])
        num_images = len(imgs)
        if num_images == 0:
            # Without images the per-image average below is undefined; fail
            # with a clear message instead of a ZeroDivisionError, and do not
            # cache the empty array.
            raise RuntimeError(f"no images available for source size {src_shape}")
        if not cached:
            np.save(cache_path, imgs)

        for dst_shape in size:
            # NOTE: the CPU (cv2.resize) baseline is currently disabled; only
            # the CUDA path is measured.
            cuda_metrics = []
            for index in range(0, num_images, batch):
                input_array = imgs[index:index + batch]
                input_array_gpu = cp.empty(shape=input_array.shape, dtype=input_array.dtype)
                cp.cuda.runtime.memcpy(dst=int(input_array_gpu.data),  # dst_ptr
                                       src=input_array.ctypes.data,    # src_ptr
                                       size=input_array.nbytes,
                                       kind=1)  # 1: HtoD
                # Make sure the upload is finished so it is not attributed to
                # the resize timing.
                cp.cuda.runtime.deviceSynchronize()

                start = time.perf_counter()
                _, _, _, output_array = cuda_resize(input_array_gpu,
                                                    dst_shape[::-1],  # (w, h) -> (h, w)
                                                    pad=False)
                # GPU work may be launched asynchronously; wait for it before
                # stopping the clock, otherwise only launch overhead is timed.
                cp.cuda.runtime.deviceSynchronize()
                cuda_metrics.append(time.perf_counter() - start)

                # Release both buffers so the pool can actually return the memory.
                del input_array_gpu, output_array
                cp.get_default_memory_pool().free_all_blocks()

            gpu_ = sum(cuda_metrics)
            # total seconds / number of images * 1e6 -> microseconds per image.
            # A single .loc write replaces the chained `df[col][row] = ...`
            # assignment, which is a no-op under pandas Copy-on-Write.
            benchmark.loc[str(dst_shape), str(src_shape)] = gpu_ / num_images * 1_000_000
        del imgs
    print(benchmark)