NeuralCore/tensor_processor.h at main · SammaelA/NeuralCore · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#pragma once
#include <vector>
#include <cmath>
#include <map>
#include <string>
#include <memory>

// Forward declaration (global namespace) of the implementation type.
// nn::TensorProcessor below only stores a std::shared_ptr to it, so the full
// definition is not required in this header.
class TensorProcessorImpl;
namespace nn
{
  // Description of a tensor program: a flat list of commands operating on
  // numbered variables, plus the constants and the name -> variable maps for
  // its inputs and outputs.
  struct TensorProgram
  {
    constexpr static unsigned MAX_DIM = 8;   // capacity of Variable::sizes
    constexpr static unsigned CMD_ARGS = 8;  // capacity of Command::args
    constexpr static unsigned CONSTS_VAR_ID = 1; // NOTE(review): presumably the id of the variable backing <constants> - confirm

    //each command is [type, A, B, C, arg0, arg1, arg2]
    //A,B - input variables
    //C - output variable
    //NOTE(review): Command::args has CMD_ARGS (8) slots, which is more than the
    //A, B, C, arg0..arg2 listed above - the exact slot layout should be confirmed
    //against the implementation.
    enum CommandType
    {
      NOOP,     // do nothing

      MOV,      // memcpy(C,A, sizeof(float)*A.total_size)
      FILL,     // fill(C, as_float(arg0))
      COPY,     // memcpy(C+arg1, A+arg0, sizeof(float)*arg2)
      PAD,      // padding along the given axis
      FLIP,     // reverse order of values along the given axis
      DILATE,   // put some zero values between values from input tensor
      URAND,    // fills tensor with values, uniformly distributed in [0,1]

      ADD,      // C = A+B
      SUB,      // C = A-B
      MUL,      // C = A*B
      DIV,      // C = A/B
      GREATER,  // C = A > B
      LESS,     // C = A < B
      EQUAL,    // C = A == B (precisely)
      GE,       // C = A >= B
      LE,       // C = A <= B
      NE,       // C = A != B (precisely)
      OR,       // C = A>0 || B>0
      AND,      // C = A>0 && B>0
      WHERE,    // C = B>0 ? A : 0 (elementwise)
      MIN,      // C = min(A, B)
      MAX,      // C = max(A, B)
      POW,      // C = A^B

      EXP,      // C = exp(A)
      SQRT,     // C = sqrt(A)
      SIN,      // C = sin(A)
      COS,      // C = cos(A)
      LOG,      // C = log(A)
      NOT,      // C = A > 0 ? 0 : 1 (elementwise)

      SUM,      // C = sum(A)
      O_SUM,    // C = sum(A) (NOTE(review): same description as SUM; presumably reduces along a different axis - confirm)
      MINIMUM,  // C = min(A)
      MAXIMUM,  // C = max(A)

      MATMUL_T, // C = Ax(B^T)
      TRANSP,   // C = transpose(A)
      OUTER_P,  // C = outer_product(A, B)
      SMAX_D,   // derivative of softmax function. It's complicated enough to have a separate command for it
      CONV_2D,  // convolution with arbitrary number of channels and filters. Borders are ignored
      MPOOL,    // C = max pooling(A) with arbitrary window size
      MPOOL_D,  // derivative of max pooling
      CONV_3D,  // convolution with arbitrary number of channels and filters. Borders are ignored
      MPOOL_3D, // C = 3D max pooling(A) with arbitrary window size
      MPOOL_3D_D,// derivative of 3D max pooling


      CMD_COUNT // number of command types; must stay the last enumerator
    };

    // Coarse category of a command, stored in CmdProperties.
    enum CmdClass
    {
      AUXILIARY,
      MEM_MANAGEMENT,
      ARITHMETICS,
      ELEMENTWISE,
      REDUCTION,
      ALGEBRA,
      OTHER
    };

    // Flag stored in CmdProperties.
    // NOTE(review): presumably tells whether the output variable may alias an
    // input variable of the same command - confirm.
    enum CmdIsSelfApplicable
    {
      SELF_APPLICABLE_NO,
      SELF_APPLICABLE_YES
    };

    // Static description of one command type.
    struct CmdProperties
    {
      CommandType type;
      std::string name;
      CmdClass cls;
      CmdIsSelfApplicable is_self_applicable;
    };

    // One instruction of the program: command type plus its raw arguments.
    struct Command
    {
      CommandType type;
      unsigned args[8]; //CMD_ARGS
    };

    // Descriptor of one tensor variable.
    // NOTE(review): presumably Dim is the number of used entries in <sizes>,
    // and offset/total_size locate the tensor inside the program's memory
    // (in floats) - confirm against the implementation.
    struct Variable
    {
      unsigned Dim;
      unsigned offset;
      unsigned total_size;
      unsigned sizes[8]; //MAX_DIM
    };

    // The array sizes above are written as literals; make sure they cannot
    // silently drift away from the named constants they are documented with.
    static_assert(sizeof(Command::args) / sizeof(unsigned) == CMD_ARGS,
                  "Command::args must have CMD_ARGS elements");
    static_assert(sizeof(Variable::sizes) / sizeof(unsigned) == MAX_DIM,
                  "Variable::sizes must have MAX_DIM elements");

    // Table of per-command properties; declared here, defined elsewhere.
    static std::vector<CmdProperties> cmd_properties;

    std::vector<Command> commands;
    std::vector<Variable> vars;
    std::vector<float> constants;

    // Zero-initialized so that a default-constructed program never exposes
    // an indeterminate value.
    unsigned total_memory_req = 0;

    std::map<std::string, unsigned> input_vars;  //name -> var_id
    std::map<std::string, unsigned> output_vars; //name -> var_id
  };

  // Entry point for running a TensorProgram on a selected backend.
  // Every public function is static and the constructor is private.
  // NOTE(review): presumably a single internal instance is created and owned
  // by the implementation file - confirm there.
  class TensorProcessor
  {
  public:
    // Execution backend selected via init().
    enum class Backend
    {
      CPU,
      GPU
    };

    // Options passed to set_runtime_settings().
    struct RuntimeSettings
    {
      // Enabled by default.
      // NOTE(review): presumably toggles a cooperative-matrix path for matrix
      // multiplication - confirm against the implementation.
      bool use_coop_mat_mul = true;
    };

    //selects the backend to use
    static void init(Backend backend);
    //sets given program for execution. Initializes memory etc.
    static void set_program(const TensorProgram &program);
    //transfers data to input tensor with <name>
    //if <data_size> is less than the tensor size, the remaining part is padded with zeros
    //all inputs should be set before execution
    static void set_input(const std::string &name, const float * const data, unsigned data_size);
    //transfers data from output tensor with <name> to given address
    //if <data_size> is less than the tensor size, only this part is transferred
    static void get_output(const std::string &name, float *data, unsigned data_size);
    //runs the program previously passed to set_program()
    static void execute();
    static void print_execution_stat();
    static void set_runtime_settings(RuntimeSettings settings);
  private:
    // Private: instances cannot be created by user code.
    TensorProcessor();
    // Implementation object (type is only forward-declared in this header).
    std::shared_ptr<TensorProcessorImpl> pImpl;
    TensorProgram program;
    // input name -> flag; NOTE(review): presumably set once set_input() was
    // called for that input - confirm.
    std::map<std::string, bool> input_prepared;
    bool program_prepared = false;
    // No in-class initializer; NOTE(review): expected to be assigned by the
    // constructor or init() - confirm.
    Backend backend;
    RuntimeSettings settings;
  };
}