import math
from settings import *
from factors import *
from levels import *
from prints import *
from utils import *
from arch import *
"""
Entry point for the analytical model.
Updates the MOPs and Latency data of each level w.r.t. the current mapping.
"""
def updateStats(arch : Arch, bias_read : bool) -> tuple[float, int]:
    assert arch.initialized, f"Arch {arch.name}: architecture not initialized, make sure to call 'initFactors' first."
# MOPs:
WMOPs = 0
temporal_iterations = 1
spatial_iterations = 1
last_in_reads, last_w_reads, last_out_reads, last_out_writes = 0, 0, 0, 0
acc_out_reads_factors = 1
# NOTE: here we compute total MOPs, not per-instance
for i in range(len(arch)):
level = arch[i]
if isinstance(level, MemLevel):
# multiply by spatial_iterations too because memory is replicated spatially
level_mops = level.MOPs()
in_reads, w_reads, out_reads, out_writes = map(lambda m : m*temporal_iterations*spatial_iterations, level_mops[:4])
out_reads_factors = level_mops[4]*acc_out_reads_factors
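            # if the bias is not read, the first of the `out_reads_factors` accumulation rounds presumably
            # has no partial output to read back, so one round's worth of output reads is removed here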
if not bias_read and out_reads_factors != 0:
out_reads = (out_reads*(out_reads_factors - 1))//out_reads_factors
if 'in' not in level.bypasses:
in_writes = last_in_reads # reads above are written here
last_in_reads = in_reads
else:
in_writes = 0
if 'w' not in level.bypasses:
w_writes = last_w_reads # reads above are written here
last_w_reads = w_reads
else:
w_writes = 0
if 'out' not in level.bypasses:
level.setAboveMOPs(last_out_reads, last_out_writes)
out_writes += last_out_reads # reads above are written here
last_out_reads = out_reads
if not Settings.FREE_DRAINS:
                    out_reads += last_out_writes # writes above were read here
last_out_writes = out_writes
else:
level.setAboveMOPs(0, 0)
level.setMOPs(in_reads = in_reads, w_reads = w_reads, out_reads = out_reads, in_writes = in_writes, w_writes = w_writes, out_writes = out_writes)
level.temporal_iterations = temporal_iterations
reads = in_reads + w_reads + out_reads
writes = in_writes + w_writes + out_writes
level.active_instances = spatial_iterations
WMOPs += level.WMOPs(reads, writes)
temporal_iterations *= level.factors.fullProduct()
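            # accumulate the iterations along dimensions not coupled to the output, so that levels below
            # this one see the total number of output accumulation rounds performed above them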
acc_out_reads_factors *= math.prod(level.factors.dimProduct(dim) for dim in level.dataflow if dim not in level.arch.coupling.flat_out_coupling)
elif isinstance(level, FanoutLevel):
spatial_iterations *= level.factors.fullProduct()
# spatial reuse of an operand occurs if the fanout is along a dimension not coupled to such operand,
# hence, the operand is read once, but written once per instance (modeled by last_XX_reads)
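            # e.g., a fanout of size N along a dimension not coupled to the input reads the input tile once
            # from above, but writes a copy of it into each of the N instances below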
# TODO: add NoC modeling and accumulate data transfer energy here!
for dim in level.dataflow:
iterations = level.factors.dimProduct(dim)
if dim not in arch.coupling.flat_in_coupling:
last_in_reads *= iterations
if dim not in arch.coupling.flat_w_coupling:
last_w_reads *= iterations
if dim not in arch.coupling.flat_out_coupling:
last_out_reads *= iterations
last_out_writes *= iterations
elif isinstance(level, ComputeLevel):
            # TODO: remove the cost of the first output accumulation if bias_read is False!
            # => not strictly needed, as the cost of the add is much smaller than that of the multiply!
level.temporal_iterations = temporal_iterations
level.active_instances = spatial_iterations
WMOPs += level.computeCost(temporal_iterations*spatial_iterations)
# compute is meant to be the innermost level
break
# Latency:
max_latency = 0
cc_per_tile = arch[len(arch)-1].latency()
# IMPROVEMENT:
# - data for the operand which is stationary can be drained/filled immediately as all iterations unfold (cc_per_tile * fullProduct)
# - data for the non-stationary operands can be filled/drained only during the last iteration of the outermost loop (cc_per_tile * product of only the two inner dimensions)
# - if double buffering, add the outermost dimension to the second product too
# WARNING:
    # - the stationary operand only ever occupies a PART of its allotted space, so the size constraint shall be relaxed according to the number of instances
    #   of such operand which are scheduled to be kept at the same time
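    # the hierarchy is walked bottom-up: each memory level's latency becomes the per-tile time (cc_per_tile)
    # seen by the level above it, so stalls propagate outward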
    previous_fanout_pe_to_pe_warmup = 0
    for i in range(len(arch) - 2, -1, -1):
        level = arch[i]
if isinstance(level, MemLevel):
scaling = level.active_instances*level.temporal_iterations
cc_per_all_tiles = cc_per_tile*level.factors.fullProduct()
scaled_cc_per_all_tiles = scaling*cc_per_all_tiles
ideal_bandwidth_read = level.getRead()/scaled_cc_per_all_tiles # original -more readable- formulation: (level.getRead()/scaling)/(cc_per_tile*level.factors.fullProduct())
ideal_bandwidth_update = level.getUpdate()/scaled_cc_per_all_tiles
# TODO: this should get divided per-operand, as depending on the dataflow, an operand may have more or less iterations to be loaded
# TODO: support double buffering on a per-operand basis
# NOTE: the current implementation coincides with Timeloop's notion of Buffets, but it is not exact...
# NOTE: my bandwidth is already a bit more accurate since Timeloop ignores drains...
#outermost_available_iterations = 1 if level.multiple_buffering == 1 else level.factors.dimProduct(level.dataflow[0])*(level.multiple_buffering - 1)
#ideal_bandwidth_fill = (level.getFill()/scaling)/(cc_per_tile*level.factors.dimProduct(level.dataflow[1])*level.factors.dimProduct(level.dataflow[2])*outermost_available_iterations)
#ideal_bandwidth_drain = (level.getDrain()/scaling)/(cc_per_tile*level.factors.dimProduct(level.dataflow[1])*level.factors.dimProduct(level.dataflow[2])*outermost_available_iterations)
ideal_bandwidth_fill = level.getFill()/scaled_cc_per_all_tiles
ideal_bandwidth_drain = level.getDrain()/scaled_cc_per_all_tiles if not Settings.FREE_DRAINS else 0
# bandwidth is statically divided between reads and writes
            # NOTE: warmup cycles cannot be used to compensate for a lack of bandwidth at steady state
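            # if the ideal bandwidth fits in the available one, the level keeps up and its latency is just the
            # time to stream all tiles (plus any pe-to-pe warmup); otherwise it is bandwidth-bound and the
            # latency follows from the data moved divided by the available bandwidth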
if ideal_bandwidth_read + ideal_bandwidth_drain <= level.read_bandwidth:
latency_read_drain = cc_per_all_tiles + previous_fanout_pe_to_pe_warmup*cc_per_tile
else:
latency_read_drain = ((level.getRead() + level.getDrain())/scaling)*(1/level.read_bandwidth) if not Settings.FREE_DRAINS else (level.getRead()/scaling)*(1/level.read_bandwidth)
if ideal_bandwidth_fill + ideal_bandwidth_update <= level.write_bandwidth:
latency_fill_update = cc_per_all_tiles + previous_fanout_pe_to_pe_warmup*cc_per_tile
else:
latency_fill_update = ((level.getFill() + level.getUpdate())/scaling)*(1/level.write_bandwidth)
latency = max(latency_read_drain, latency_fill_update)
stall_cycles = latency - cc_per_all_tiles
level.setLatency(latency_read_drain = latency_read_drain*level.temporal_iterations, latency_fill_update = latency_fill_update*level.temporal_iterations, cc_per_tile = cc_per_tile, stall_cycles = stall_cycles*level.temporal_iterations, ideal_bandwidth_read = ideal_bandwidth_read, ideal_bandwidth_update = ideal_bandwidth_update, ideal_bandwidth_fill = ideal_bandwidth_fill, ideal_bandwidth_drain = ideal_bandwidth_drain)
            #Timeloop does this (losing knowledge of the true behaviour): cc_per_tile = cc_per_tile*level.factors.fullProduct()
previous_fanout_pe_to_pe_warmup = 0
cc_per_tile = latency
elif isinstance(level, FanoutLevel) and level.pe_to_pe:
            # pe-to-pe forwarding implies that the SA operates as a PIPELINE, whose overall latency is that of a single operation plus a warmup dependent on the mesh size
previous_fanout_pe_to_pe_warmup = level.mesh - 1
# Active instances, leakage, and final latency:
temporal_iterations = 1
powered_instances = 1
for i in range(len(arch)):
level = arch[i]
if isinstance(level, MemLevel):
max_latency = max(max_latency, level.getSettedLatency())
#print(f"Leakage level {level.name}: {level.Leakage(level.getSettedLatency())*powered_instances}")
WMOPs += level.Leakage(level.getSettedLatency())*powered_instances
temporal_iterations *= level.factors.fullProduct()
elif isinstance(level, FanoutLevel):
max_latency = max(max_latency, level.latency()*temporal_iterations)
if level.power_gating_support:
powered_instances *= level.factors.fullProduct()
else:
powered_instances *= level.mesh
elif isinstance(level, ComputeLevel):
max_latency = max(max_latency, level.latency()*temporal_iterations)
WMOPs += level.Leakage(level.latency())*powered_instances
break
return WMOPs, max_latency
"""
Weighted Arithmetic Intensity (WART)
It is equivalent to FLOPs/EDPoU, where EDPoU = (Energy * Latency) / Utilization
=> Maximizing the WART minimizes the EDPoU.
"""
def Wart(arch : Arch, comp : Shape, bias_read : bool, utilization_exponent : int = 1) -> float:
FLOPs = comp.FLOPs()
WMOPs, max_latency = updateStats(arch, bias_read)
utilization = arch.spatialUtilization()**utilization_exponent if Settings.UTILIZATION_IN_WART else 1
return (FLOPs/(WMOPs*max_latency))*utilization
"""
Energy-Delay Product [pJ*cc]
If pJ_to_J is True, the returned value is in [J*cc].
"""
def EDP(arch : Arch, bias_read : bool, pJ_to_J : bool = False) -> float:
WMOPs, max_latency = updateStats(arch, bias_read)
return WMOPs*max_latency*(10**-12 if pJ_to_J else 1)
"""
Latency [cc]
"""
def Latency(arch : Arch) -> int:
max_latency = 0
for level in arch:
if isinstance(level, MemLevel):
            max_latency = max(max_latency, level.getSettedLatency())
elif isinstance(level, FanoutLevel):
continue
elif isinstance(level, ComputeLevel):
break
return max_latency
"""
Energy [pJ]
If pJ_to_uJ is True, the returned value is in [uJ].
"""
def Energy(arch : Arch, pJ_to_uJ : bool = False) -> float:
WMOPs = 0
for level in arch:
if isinstance(level, MemLevel):
reads = level.in_reads + level.w_reads + level.out_reads
writes = level.in_writes + level.w_writes + level.out_writes
WMOPs += level.WMOPs(reads, writes)
elif isinstance(level, FanoutLevel):
continue
elif isinstance(level, ComputeLevel):
WMOPs += level.computeCost(level.temporal_iterations*level.active_instances)
break
return WMOPs*(10**-6 if pJ_to_uJ else 1)
"""
Total read and write Memory Operations (MOPs)
"""
def MOPs(arch : Arch) -> tuple[int, int]:
tot_reads, tot_writes = 0, 0
for level in arch:
if isinstance(level, MemLevel):
reads = level.in_reads + level.w_reads + level.out_reads
writes = level.in_writes + level.w_writes + level.out_writes
tot_reads += reads
tot_writes += writes
elif isinstance(level, FanoutLevel):
continue
elif isinstance(level, ComputeLevel):
break
return tot_reads, tot_writes
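
# Minimal usage sketch (assuming `arch` is an already-initialized Arch and `comp` the workload Shape;
# constructing and mapping them is outside the scope of this module):
#
#   wart = Wart(arch, comp, bias_read=False)   # runs updateStats, refreshing per-level MOPs and latency
#   edp = EDP(arch, bias_read=False)           # Energy-Delay Product in pJ*cc
#   reads, writes = MOPs(arch)                 # totals gathered from the stats set by updateStats
#   print(f"WART: {wart}, EDP: {edp}, MOPs: {reads + writes}")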