Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions benchmarks/opencl/ma/compact-features.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// clang -x cl -cl-std=CL2.0 -target spir-unknown-unknown -cl-opt-disable \
// -finline-functions -finline-hint-functions \
// -emit-llvm -fno-discard-value-names -c compact-features.cl
// -o compact-features.bc -DNUM_VAR=1 -DNUM_OUTPUT_VAR=1
// llvm-spirv compact-features.bc -o compact-features.spv
// spirv-dis compact-features.spv > compact-features.spvasm

#ifdef DV2WG
#define scope memory_scope_work_group
#else
#define scope memory_scope_device
#endif

#ifdef LC2GB
#define bar_flag CLK_GLOBAL_MEM_FENCE
#else
#define bar_flag CLK_LOCAL_MEM_FENCE
#endif

__kernel void compact_features(__global uint* flags,
__global uint* out_indices,
__global uint* group_offset) {
__local uint s_idx;

uint tid = get_local_id(0);
uint gid = get_global_id(0);
uint group_id = get_group_id(0);

// The work-group leader initializes the index using a plain store.
if (tid == 0) {
s_idx = group_offset[group_id];
}

barrier(bar_flag);

// Threads filter data and contend for slots in the output list.
if (flags[gid]) {
uint dst = atomic_fetch_add_explicit((atomic_uint*)&s_idx, 1, memory_order_relaxed, scope);
out_indices[dst] = gid;
}
}
33 changes: 33 additions & 0 deletions benchmarks/opencl/ma/histogram-implicit.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// clang -x cl -cl-std=CL2.0 -target spir-unknown-unknown -cl-opt-disable \
// -finline-functions -finline-hint-functions \
// -emit-llvm -fno-discard-value-names -c histogram.cl
// -o histogram.bc -DNUM_VAR=1 -DNUM_OUTPUT_VAR=1
// llvm-spirv histogram.bc -o histogram.spv
// spirv-dis histogram.spv > histogram.spvasm

#define HIST_BINS 2

__kernel void histo_main_kernel(global uint *sm_mappings,
global uint *global_histo)
{
__local uint sub_histo[HIST_BINS];

int tid = get_local_id(0);
int gid = get_global_id(0);

// Safe plain store because threads own distinct indices;
sub_histo[tid] = 0;

barrier(CLK_LOCAL_MEM_FENCE);

// Multiple threads contend for the same bins;
uint bin_index = sm_mappings[gid];
atom_add(sub_histo + bin_index, 1);

barrier(CLK_LOCAL_MEM_FENCE);

// Read local result plain to flush to global;
uint count = sub_histo[tid];
if (count > 0)
atom_add(global_histo + tid, count);
}
45 changes: 45 additions & 0 deletions benchmarks/opencl/ma/histogram.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// clang -x cl -cl-std=CL2.0 -target spir-unknown-unknown -cl-opt-disable \
// -finline-functions -finline-hint-functions \
// -emit-llvm -fno-discard-value-names -c histogram.cl
// -o histogram.bc -DNUM_VAR=1 -DNUM_OUTPUT_VAR=1
// llvm-spirv histogram.bc -o histogram.spv
// spirv-dis histogram.spv > histogram.spvasm

#ifdef DV2WG
#define scope memory_scope_work_group
#else
#define scope memory_scope_device
#endif

#ifdef LC2GB
#define flag CLK_GLOBAL_MEM_FENCE
#else
#define flag CLK_LOCAL_MEM_FENCE
#endif

#define HIST_BINS 2

__kernel void histo_main_kernel(__global uint *sm_mappings,
__global uint *global_histo)
{
__local uint sub_histo[HIST_BINS];

int tid = get_local_id(0);
int gid = get_global_id(0);

// Safe plain store because threads own distinct indices;
sub_histo[tid] = 0;

barrier(CLK_LOCAL_MEM_FENCE);

// Multiple threads contend for the same bins;
uint bin_index = sm_mappings[gid];
atomic_fetch_add_explicit((atomic_uint*)&sub_histo[bin_index], 1, memory_order_relaxed, memory_scope_work_group);

barrier(CLK_LOCAL_MEM_FENCE);

// Read local result plain to flush to global;
uint count = sub_histo[tid];
if (count > 0)
atomic_fetch_add_explicit((atomic_uint*)&global_histo[tid], count, memory_order_relaxed, scope);
}
33 changes: 33 additions & 0 deletions benchmarks/opencl/mixed-atomicity/histogram.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#ifdef DV2WG
#define scope memory_scope_work_group
#else
#define scope memory_scope_device
#endif

#ifdef LC2GB
#define flag CLK_GLOBAL_MEM_FENCE
#else
#define flag CLK_LOCAL_MEM_FENCE
#endif

__kernel void histo_main_kernel(__global uint *sm_mappings, __global uint *global_histo)
{
__local uint sub_histo[2];

int tid = get_local_id(0);
int gid = get_global_id(0);

sub_histo[tid] = 0;

barrier(flag);

uint bin_index = sm_mappings[gid];

atomic_fetch_add_explicit((atomic_uint*)&sub_histo[bin_index], 1, memory_order_relaxed, memory_scope_work_group);

barrier(flag);

uint count = sub_histo[tid];
if (count > 0)
atomic_fetch_add_explicit((atomic_uint*)&global_histo[tid], count, memory_order_relaxed, scope);
}
129 changes: 129 additions & 0 deletions cat/opencl-ma.cat
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
OpenCL
(* OpenCL Memory Model *)

(*
* This model is based on:
* https://multicore.doc.ic.ac.uk/overhauling/opencl_base.cat
* https://multicore.doc.ic.ac.uk/overhauling/opencl_scopedsc.cat
*)

// Base relations:
// wi: same work-item (same thread)
// swg: same work-group
// sdv: same device
// syncbar: same barrier id

// Tags:
// WI: work-item scope
// WG: work-group scope
// DV: device scope
// ALL: all-svm-devices scope
// GLOBAL: global memory
// LOCAL: local memory

// dynamic_tag relates events to itself that access an address whose init event is marked X or Fence tagged with X
let dynamic_tag(X) = [range([IW & X]; loc)] | [X & F]

let mo = co \ (NA * NA)
let sb = po
let rb = rf^-1;mo | ([R] \ [range(rf)]);loc;[W]
let unv = _ * _
let wi = int

(* Inclusive scopes *)
let incl = (swg & (WG * WG)) | (sdv & (DV * DV)) | (ALL * ALL)

(*******************)
(* Synchronisation *)
(*******************)

let Acq = (ACQ | SC | ACQ_REL) & (R | F)
let Rel = (REL | SC | ACQ_REL) & (W | F)

(* Fences sequenced before or after *)
let Fsb = [F]; sb
let sbF = sb; [F]

(* Release sequence *)
let rs_prime = (_ * RMW) | wi
let rs = mo & rs_prime & ~((mo & ~rs_prime) ; mo)

(* Release-acquire synchronisation *)
let ra_sw(r) = ((r & [Rel]); Fsb?; [W & A]; rs?; r; rf; [R & A]; sbF?; ([Acq] & r)) & incl & ~wi

(* Barrier synchronisation *)
// in OpenCL a barrier results in two fence operations: entry and exit fences:
// https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_work_group_functions
// In our implementation, we use a single barrier event and omit the special EF and XF tags
let bar_sw(r) = r; syncbar & ~wi & swg; r

(* Allowed to synchronise on the other region *)
let scf = (SC * SC) | ((dynamic_tag(GLOBAL) & dynamic_tag(LOCAL)); unv; (dynamic_tag(GLOBAL) & dynamic_tag(LOCAL)))

(* Global and local synchronises-with *)
let gsw = ra_sw(dynamic_tag(GLOBAL)) | bar_sw(dynamic_tag(GLOBAL)) | (scf & ra_sw(dynamic_tag(LOCAL)))
let lsw = ra_sw(dynamic_tag(LOCAL)) | bar_sw(dynamic_tag(LOCAL)) | (scf & ra_sw(dynamic_tag(GLOBAL)))

(******************)
(* Happens-before *)
(******************)

(* Global and local happens-before *)
// Since we use single barrier events, we exclude the identity relation from the happens-before relation
let ini = IW * ~IW
let ghb = ((dynamic_tag(GLOBAL); sb; dynamic_tag(GLOBAL)) | (dynamic_tag(GLOBAL); ini; dynamic_tag(GLOBAL)) | gsw)+ \ id
let lhb = ((dynamic_tag(LOCAL); sb; dynamic_tag(LOCAL)) | (dynamic_tag(LOCAL); ini; dynamic_tag(LOCAL)) | lsw)+ \ id

irreflexive ghb as O-HbG
irreflexive lhb as O-HbL

(*************)
(* Coherence *)
(*************)

let coh(hb) = (rf^-1)?; mo; rf?; hb
irreflexive coh(ghb) as O-CohG
irreflexive coh(lhb) as O-CohL

(************************)
(* Consistency of reads *)
(************************)

(* A load can only read from a store that already happened. *)
irreflexive rf; (ghb | lhb) as O-Rf

(* Visible side effects *)
let vis(hb) = (W * R) & hb & loc & ~((hb & loc); [W]; hb)

(* A non-atomic load can only read from a store that is visible. *)
empty (rf; ([NA] & dynamic_tag(GLOBAL))) \ vis(ghb) as O-NaRfG
empty (rf; ([NA] & dynamic_tag(LOCAL))) \ vis(lhb) as O-NaRfL

(* Consistency of RMWs *)
// The original model was tested with Herd, which treats RMW as a single atomic operation.
// We have modified the model to handle RMW as a sequence of atomic operations.
// irreflexive rf | (mo;mo;rf^-1) | (mo;rf) as O-Rmw
empty rmw & (fre; coe) as 0-Atomic

(****************************************)
(* Sequential consistency, simplified, *)
(* with scoped SC axioms *)
(****************************************)

let scp = Fsb?; (rb | mo | (ghb | lhb)); sbF?
acyclic (SC*SC) & scp & incl as O-Sscoped

(***************)
(* Races *)
(***************)

(* data_races *)
let cnf = ((W * W) | (W * R) | (R * W)) & loc
// TODO: "there is exactly one initial event per location", in current implementation, memory object like global scope id is initialized by three write events
let dr = cnf & ~(ghb | lhb) & ~(ghb | lhb)^-1 & ~wi & ~incl \ ((_ * IW) | (IW * _))
flag ~empty dr as data_race

(* unsequenced_races *)
let symm(r) = r | r^-1
let ur = (wi & cnf & ~symm(sb)) \ id
flag ~empty ur as unsequenced_race
2 changes: 1 addition & 1 deletion cat/opencl.cat
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ let rs_prime = (_ * RMW) | wi
let rs = mo & rs_prime & ~((mo & ~rs_prime) ; mo)

(* Release-acquire synchronisation *)
let ra_sw(r) = ((r & [Rel]); Fsb?; [W \ WI]; rs?; r; rf; [R \ WI]; sbF?; ([Acq] & r)) & incl & ~wi
let ra_sw(r) = ((r & [Rel]); Fsb?; [W & A]; rs?; r; rf; [R & A]; sbF?; ([Acq] & r)) & incl & ~wi

(* Barrier synchronisation *)
// in OpenCL a barrier results in two fence operations: entry and exit fences:
Expand Down
2 changes: 2 additions & 0 deletions dartagnan/src/main/antlr4/Spirv.g4
Original file line number Diff line number Diff line change
Expand Up @@ -1741,6 +1741,7 @@ clspvReflection
| clspvReflection_specConstantSubgroupMaxSize
| clspvReflection_specConstantWorkDim
| clspvReflection_specConstantWorkgroupSize
| clspvReflection_workgroupVariableSize
;

clspvReflection_kernel : ModeExt_Kernel kernelIdRef nameIdRef (numArguments (flags attributes?)?)?;
Expand All @@ -1755,6 +1756,7 @@ clspvReflection_argumentStorageImage : ModeExt_ArgumentStorageImage decl ordinal
clspvReflection_argumentSampler : ModeExt_ArgumentSampler decl ordinal descriptorSetIdRef binding argInfo?;
clspvReflection_argumentWorkgroup : ModeExt_ArgumentWorkgroup decl ordinal specId elemSize argInfo?;
clspvReflection_specConstantWorkgroupSize : ModeExt_SpecConstantWorkgroupSize x y z;
clspvReflection_workgroupVariableSize : ModeExt_WorkgroupVariableSize x y;
clspvReflection_specConstantGlobalOffset : ModeExt_SpecConstantGlobalOffset x y z;
clspvReflection_specConstantWorkDim : ModeExt_SpecConstantWorkDim dimIdRef;
clspvReflection_pushConstantGlobalOffset : ModeExt_PushConstantGlobalOffset offsetIdRef sizeIdRef;
Expand Down
1 change: 1 addition & 0 deletions dartagnan/src/main/antlr4/SpirvLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -1836,6 +1836,7 @@ ModeExt_PrintfInfo : 'PrintfInfo';
ModeExt_PrintfBufferStorageBuffer : 'PrintfBufferStorageBuffer';
ModeExt_PrintfBufferPointerPushConstant : 'PrintfBufferPointerPushConstant';
ModeExt_NormalizedSamplerMaskPushConstant : 'NormalizedSamplerMaskPushConstant';
ModeExt_WorkgroupVariableSize : 'WorkgroupVariableSize';

ModeExt_Round : 'Round';
ModeExt_RoundEven : 'RoundEven';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public VisitorOpsMemory(ProgramBuilder builder) {
@Override
public Event visitOpStore(SpirvParser.OpStoreContext ctx) {
Expression pointer = builder.getExpression(ctx.pointer().getText());
pointer.getMemoryObjects().forEach(mo -> mo.addFeatureTag(Tag.C11.NON_ATOMIC_LOCATION));
String valueId = ctx.object().getText();
Expression value = builder.getExpression(valueId);
Type type = value.getType();
Expand All @@ -63,6 +64,7 @@ public Event visitOpStore(SpirvParser.OpStoreContext ctx) {
public Event visitOpLoad(SpirvParser.OpLoadContext ctx) {
String resultId = ctx.idResult().getText();
Expression pointer = builder.getExpression(ctx.pointer().getText());
pointer.getMemoryObjects().forEach(mo -> mo.addFeatureTag(Tag.C11.NON_ATOMIC_LOCATION));
Type type = builder.getType(ctx.idResultType().getText());
List<Event> events = visitMemoryAccess(resultId, type, pointer, (i, exp) -> {
String regId = resultId;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ public Expression visitClspvReflection_specConstantWorkgroupSize(SpirvParser.Cls
return null;
}

@Override
public Expression visitClspvReflection_workgroupVariableSize(SpirvParser.ClspvReflection_workgroupVariableSizeContext ctx) {
// Do nothing, will be overwritten by BuiltIn WorkgroupSize
return null;
}

@Override
public Expression visitClspvReflection_pushConstantGlobalOffset(SpirvParser.ClspvReflection_pushConstantGlobalOffsetContext ctx) {
return setPushConstantValue("PushConstantGlobalOffset", ctx.offsetIdRef().getText(), ctx.sizeIdRef().getText());
Expand Down Expand Up @@ -195,7 +201,8 @@ public Set<String> getSupportedInstructions() {
"PushConstantNumWorkgroups",
"PushConstantRegionOffset",
"PushConstantRegionGroupOffset",
"SpecConstantWorkgroupSize"
"SpecConstantWorkgroupSize",
"WorkgroupVariableSize"
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@ public static final class OpenCL {
public static final String GLOBAL_SPACE = "GLOBAL";
public static final String LOCAL_SPACE = "LOCAL";
public static final String GENERIC_SPACE = "GENERIC";
public static final String PRIVATE = "PRIVATE";
// Default Tags
public static final String DEFAULT_SPACE = GENERIC_SPACE;
public static final String DEFAULT_SCOPE = DEVICE;
Expand Down Expand Up @@ -563,16 +564,16 @@ public static String toOpenCLTag(String tag) {
// Storage class
case SC_GENERIC -> OpenCL.GENERIC_SPACE;
case SC_FUNCTION,
SC_INPUT,
SC_PRIVATE -> OpenCL.PRIVATE;
case SC_INPUT,
SC_WORKGROUP -> OpenCL.LOCAL_SPACE;
case SC_UNIFORM_CONSTANT,
SC_PHYS_STORAGE_BUFFER,
SC_CROSS_WORKGROUP -> OpenCL.GLOBAL_SPACE;
case SC_PUSH_CONSTANT,
SC_UNIFORM,
SC_OUTPUT,
SC_STORAGE_BUFFER,
SC_PRIVATE -> throw new UnsupportedOperationException(
SC_STORAGE_BUFFER -> throw new UnsupportedOperationException(
getErrorMsg(model, "storage class", tag));

default -> throw new IllegalArgumentException(
Expand Down
Loading