|
1 | | ---- third_party/fft1d/device/fft1d.cl 2017-05-09 22:47:43.000000000 +0000 |
2 | | -+++ sdaccel/fft1d/device/fft1d.cl 2017-09-12 19:21:02.120000000 +0000 |
3 | | -@@ -49 +49 @@ |
| 1 | +--- device/fft1d.cl 2018-02-12 17:54:56.000000000 +0000 |
| 2 | ++++ device/fft1d.cl 2018-07-23 20:57:12.414000000 +0000 |
| 3 | +@@ -46,7 +46,7 @@ |
| 4 | + // Include source code for an engine that produces 8 points each step |
| 5 | + #include "fft_8.cl" |
| 6 | + |
4 | 7 | -#pragma OPENCL EXTENSION cl_intel_channels : enable |
5 | 8 | +//#pragma OPENCL EXTENSION cl_intel_channels : enable |
6 | | -@@ -64 +64,9 @@ |
| 9 | + |
| 10 | + #include "../host/inc/fft_config.h" |
| 11 | + |
| 12 | +@@ -66,11 +66,19 @@ |
| 13 | + #define CONT_FACTOR (1 << LOG_CONT_FACTOR) |
| 14 | + |
| 15 | + // Need some depth to our channels to accomodate their bursty filling. |
7 | 16 | -channel float2 chanin[8] __attribute__((depth(CONT_FACTOR*8))); |
8 | 17 | + |
9 | 18 | +pipe float2 chanin0 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8))); |
|
14 | 23 | +pipe float2 chanin5 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8))); |
15 | 24 | +pipe float2 chanin6 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8))); |
16 | 25 | +pipe float2 chanin7 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8))); |
17 | | -@@ -68 +76 @@ |
18 | | -- #pragma unroll |
19 | | -+ __attribute__((opencl_unroll_hint())) |
20 | | -@@ -137,2 +145,2 @@ |
| 26 | + |
| 27 | + uint bit_reversed(uint x, uint bits) { |
| 28 | + uint y = 0; |
| 29 | +- #pragma unroll |
| 30 | ++ __attribute__((opencl_unroll_hint())) |
| 31 | + for (uint i = 0; i < bits; i++) { |
| 32 | + y <<= 1; |
| 33 | + y |= x & 1; |
| 34 | +@@ -139,15 +147,15 @@ |
| 35 | + } |
| 36 | + |
| 37 | + // group dimension (N/(8*CONT_FACTOR), num_iterations) |
21 | 38 | -__attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1))) |
22 | 39 | -kernel void fetch (global float2 * restrict src) { |
23 | 40 | +kernel __attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1))) |
24 | 41 | +void fetch (global float2 * restrict src) { |
25 | | -@@ -145 +153 @@ |
| 42 | + |
| 43 | + const int N = (1 << LOGN); |
| 44 | + // Each thread will fetch POINTS points. Need POINTS times to pass to FFT. |
| 45 | + const int BUF_SIZE = 1 << (LOG_CONT_FACTOR + LOGPOINTS + LOGPOINTS); |
| 46 | + |
| 47 | + // Local memory for CONT_FACTOR * POINTS points |
26 | 48 | - local float2 buf[BUF_SIZE]; |
27 | | -+ local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1))); |
28 | | -@@ -156,2 +164,2 @@ |
| 49 | ++ __local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1))); |
| 50 | + |
| 51 | + uint iteration = get_global_id(1); |
| 52 | + uint group_per_iter = get_global_id(0); |
| 53 | +@@ -158,17 +166,17 @@ |
| 54 | + uint lid = get_local_id(0); |
| 55 | + uint local_addr = lid << LOGPOINTS; |
| 56 | + |
29 | 57 | - #pragma unroll |
30 | 58 | - for (uint k = 0; k < POINTS; k++) { |
31 | 59 | + __attribute__((opencl_unroll_hint())) |
32 | 60 | + for (uint k = 0; k < POINTS; k+=2) { |
33 | | -@@ -160 +167,0 @@ |
34 | | -- |
35 | | -@@ -163,4 +170,4 @@ |
| 61 | + buf[local_addr + k] = src[global_addr + k]; |
| 62 | + } |
| 63 | + |
| 64 | + barrier (CLK_LOCAL_MEM_FENCE); |
| 65 | + |
36 | 66 | - #pragma unroll |
37 | 67 | - for (uint k = 0; k < POINTS; k++) { |
38 | 68 | - uint buf_addr = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid; |
39 | 69 | - write_channel_intel (chanin[k], buf[buf_addr]); |
40 | 70 | + uint buf_addr[8]; |
41 | 71 | + __attribute__((opencl_unroll_hint())) |
42 | | -+ for(uint k=0;k<8;k++) { |
| 72 | ++ for (uint k = 0; k < 8; k++) { |
43 | 73 | + buf_addr[k] = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid; |
44 | | -@@ -167,0 +175,12 @@ |
| 74 | + } |
| 75 | + } |
| 76 | + |
| 77 | +@@ -181,9 +189,24 @@ |
| 78 | + * 'count' represents the number of 4k sets to process |
| 79 | + * 'inverse' toggles between the direct and the inverse transform |
| 80 | + */ |
45 | 81 | + // bit_reversed reverses the bit locations of the value given. |
46 | 82 | + // The second parameter is the width of the number (in bits) to reverse. |
47 | 83 | + // Only the non-symmetric numbers are changed. E.g. 001,011,100,110 -> 100,110,100,110 |
48 | | -+ write_pipe (chanin0, &buf[buf_addr[0]]); |
49 | | -+ write_pipe (chanin1, &buf[buf_addr[1]]); |
50 | | -+ write_pipe (chanin2, &buf[buf_addr[2]]); |
51 | | -+ write_pipe (chanin3, &buf[buf_addr[3]]); |
52 | | -+ write_pipe (chanin4, &buf[buf_addr[4]]); |
53 | | -+ write_pipe (chanin5, &buf[buf_addr[5]]); |
54 | | -+ write_pipe (chanin6, &buf[buf_addr[6]]); |
55 | | -+ write_pipe (chanin7, &buf[buf_addr[7]]); |
56 | 84 | + |
57 | | -@@ -180,2 +199,2 @@ |
| 85 | ++ |
| 86 | ++ |
| 87 | ++ write_pipe(chanin0, &buf[buf_addr[0]]); |
| 88 | ++ write_pipe(chanin1, &buf[buf_addr[1]]); |
| 89 | ++ write_pipe(chanin2, &buf[buf_addr[2]]); |
| 90 | ++ write_pipe(chanin3, &buf[buf_addr[3]]); |
| 91 | ++ write_pipe(chanin4, &buf[buf_addr[4]]); |
| 92 | ++ write_pipe(chanin5, &buf[buf_addr[5]]); |
| 93 | ++ write_pipe(chanin6, &buf[buf_addr[6]]); |
| 94 | ++ write_pipe(chanin7, &buf[buf_addr[7]]); |
| 95 | ++ |
| 96 | + |
58 | 97 | -__attribute((task)) |
59 | 98 | -kernel void fft1d(global float2 * restrict dest, |
60 | 99 | +kernel __attribute((reqd_work_group_size(1, 1, 1))) //task)) |
61 | 100 | +void fft1d(global float2 * restrict dest, |
62 | | -@@ -218,8 +237,9 @@ |
| 101 | + int count, int inverse) { |
| 102 | + |
| 103 | + const int N = (1 << LOGN); |
| 104 | +@@ -220,14 +243,14 @@ |
| 105 | + float2x8 data; |
| 106 | + // Perform memory transfers only when reading data in range |
| 107 | + if (i < count * (N / 8)) { |
63 | 108 | - data.i0 = read_channel_intel(chanin[0]); |
64 | 109 | - data.i1 = read_channel_intel(chanin[1]); |
65 | 110 | - data.i2 = read_channel_intel(chanin[2]); |
|
68 | 113 | - data.i5 = read_channel_intel(chanin[5]); |
69 | 114 | - data.i6 = read_channel_intel(chanin[6]); |
70 | 115 | - data.i7 = read_channel_intel(chanin[7]); |
71 | | -+ |
72 | 116 | + read_pipe(chanin0,&data.i0); |
73 | 117 | + read_pipe(chanin1,&data.i1); |
74 | 118 | + read_pipe(chanin2,&data.i2); |
|
77 | 121 | + read_pipe(chanin5,&data.i5); |
78 | 122 | + read_pipe(chanin6,&data.i6); |
79 | 123 | + read_pipe(chanin7,&data.i7); |
| 124 | + } else { |
| 125 | + data.i0 = data.i1 = data.i2 = data.i3 = |
| 126 | + data.i4 = data.i5 = data.i6 = data.i7 = 0; |
0 commit comments