Skip to content

Commit 297e4ee

Browse files
author
George Rokos
committed
[OpenMP] Initial implementation of OpenMP offloading library - libomptarget device RTLs.
This patch implements the device runtime library whose interface is used in the code generation for OpenMP offloading devices. Currently there is a single device RTL written in CUDA meant for CUDA-enabled GPUs. The interface is a variation of the kmpc interface that includes some extra calls to do thread and storage management that only make sense for a GPU target. Differential revision: https://reviews.llvm.org/D14254 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@323649 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent d991dbe commit 297e4ee

27 files changed

+5897
-1
lines changed

README.rst

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ Options for ``libomp``
166166
Create the Fortran modules (requires Fortran compiler).
167167

168168
macOS* Fat Libraries
169-
""""""""""""""""""
169+
""""""""""""""""""""
170170
On macOS* machines, it is possible to build universal (or fat) libraries which
171171
include both i386 and x86_64 architecture objects in a single archive.
172172

@@ -254,6 +254,40 @@ Options for ``libomptarget``
254254
Path of the folder that contains ``libomp.so``. This is required for testing
255255
out-of-tree builds.
256256

257+
Options for ``NVPTX device RTL``
258+
--------------------------------
259+
260+
**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``OFF|ON``
261+
Enable CUDA LLVM bitcode offloading device RTL. This is used for link time
262+
optimization of the OMP runtime and application code.
263+
264+
**LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""``
265+
Location of a CUDA compiler capable of emitting LLVM bitcode. Currently only
266+
the Clang compiler is supported. This is only used when building the CUDA LLVM
267+
bitcode offloading device RTL. If unspecified and the CMake C compiler is
268+
Clang, then Clang is used.
269+
270+
**LIBOMPTARGET_NVPTX_BC_LINKER** = ``""``
271+
Location of a linker capable of linking LLVM bitcode objects. This is only
272+
used when building the CUDA LLVM bitcode offloading device RTL. If unspecified
273+
and the CMake C compiler is Clang and there exists a llvm-link binary in the
274+
directory containing Clang, then this llvm-link binary is used.
275+
276+
**LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER** = ``""``
277+
Host compiler to use with NVCC. This compiler is not going to be used to
278+
produce any binary. Instead, this is used to overcome the input compiler
279+
checks done by NVCC. E.g. if using a default host compiler that is not
280+
compatible with NVCC, this option can be used to pass to NVCC a valid compiler
281+
to avoid the error.
282+
283+
**LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY** = ``35``
284+
CUDA compute capability that should be supported by the NVPTX device RTL. E.g.
285+
for compute capability 6.0, the option "60" should be used. Compute capability
286+
3.5 is the minimum required.
287+
288+
**LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON``
289+
Enable printing of debug messages from the NVPTX device RTL.
290+
257291
Example Usages of CMake
258292
=======================
259293

libomptarget/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ endif()
6767

6868
# Build offloading plugins and device RTLs if they are available.
6969
add_subdirectory(plugins)
70+
add_subdirectory(deviceRTLs)
7071

7172
# Add tests.
7273
add_subdirectory(test)
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
##===----------------------------------------------------------------------===##
2+
#
3+
# The LLVM Compiler Infrastructure
4+
#
5+
# This file is dual licensed under the MIT and the University of Illinois Open
6+
# Source Licenses. See LICENSE.txt for details.
7+
#
8+
##===----------------------------------------------------------------------===##
9+
#
10+
# Build a device RTL for each available machine.
11+
#
12+
##===----------------------------------------------------------------------===##
13+
14+
add_subdirectory(nvptx)
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
##===----------------------------------------------------------------------===##
2+
#
3+
# The LLVM Compiler Infrastructure
4+
#
5+
# This file is dual licensed under the MIT and the University of Illinois Open
6+
# Source Licenses. See LICENSE.txt for details.
7+
#
8+
##===----------------------------------------------------------------------===##
9+
#
10+
# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available
11+
#
12+
##===----------------------------------------------------------------------===##
13+
14+
# Optional host compiler to hand to NVCC in place of the default one.
set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING
  "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.")

if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER)
  # Resolve the user-supplied name or path to an actual program.
  find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER})
  if(NOT ALTERNATE_CUDA_HOST_COMPILER)
    libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.")
    # Stop processing this directory, as the message promises. Falling through
    # would force a "-NOTFOUND" value into the cached CUDA_HOST_COMPILER and
    # still attempt to build the device RTL.
    return()
  endif()
  # FORCE is intentional: the user explicitly asked for this host compiler.
  set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE)
endif()
24+
25+
# We can't use clang as nvcc host preprocessor, so when the selected CUDA host
# compiler matches "clang" we attempt to replace it with a gcc located by
# find_program. If no gcc is found, the device RTL is skipped entirely.
if(CUDA_HOST_COMPILER MATCHES clang)

  find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc)

  if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER)
    libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.")
    libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of a valid compiler.")
    # Nothing below can work without a usable host compiler.
    return()
  endif()
  # Overwrite the cached host compiler so FindCUDA picks up gcc instead of clang.
  set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE)
endif()
38+
39+
if(LIBOMPTARGET_DEP_CUDA_FOUND)
40+
libomptarget_say("Building CUDA offloading device RTL.")

# The device RTL contains no host code, so there is no reason to forward the
# host compiler flags to NVCC.
set(CUDA_PROPAGATE_HOST_FLAGS OFF)

# CUDA translation units of the device runtime. This list feeds both the
# static library and the per-file LLVM bitcode compilation.
set(cuda_src_files
    src/cancel.cu
    src/critical.cu
    src/data_sharing.cu
    src/libcall.cu
    src/loop.cu
    src/omptarget-nvptx.cu
    src/parallel.cu
    src/reduction.cu
    src/sync.cu
    src/task.cu)

# Kept apart from cuda_src_files: it is added to the static library only and
# is not part of the list iterated by the bitcode build.
set(omp_data_objects
    src/omp_data.cu)
60+
61+
# Get the compute capability the user requested or use SM_35 by default.
# SM_35 is what clang uses by default.
set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY 35 CACHE STRING
  "CUDA Compute Capability to be used to compile the NVPTX device RTL.")
# NVCC architecture flag derived from the capability, e.g. "-arch sm_35".
set(CUDA_ARCH -arch sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})

# Activate RTL message dumps if requested by the user.
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
  "Activate NVPTX device RTL debug messages.")
# Test the variable by name. if(${VAR}) expands the value first and then
# evaluates it again, so a value that happens to name another variable would
# be dereferenced a second time.
if(LIBOMPTARGET_NVPTX_DEBUG)
  # Extra NVCC options: debug macro, debug info, and verbose ptxas output.
  set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v)
endif()
73+
74+
# The CUDA toolchain does not yet support dynamic linking on the device, so
# the NVPTX runtime library is always built as a static archive.
set(BUILD_SHARED_LIBS OFF)
set(CUDA_SEPARABLE_COMPILATION ON)

cuda_add_library(omptarget-nvptx STATIC
  ${cuda_src_files}
  ${omp_data_objects}
  OPTIONS
    ${CUDA_ARCH}
    ${CUDA_DEBUG}
)

# NOTE(review): the plain (keyword-less) signature is kept as in the original;
# cuda_add_library may itself link with the plain signature, and CMake rejects
# mixing both forms on one target. Confirm before adding PRIVATE/PUBLIC.
target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES})

# The archive is installed into the "lib" destination folder.
install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "lib")
86+
87+
# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user implementation.
set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB FALSE CACHE BOOL
  "Enable CUDA LLVM bitcode offloading device RTL.")
# Test the option by name rather than through ${...} to avoid a second
# dereference of its value.
if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB)

  # Find a clang compiler capable of compiling cuda files to LLVM bitcode and
  # an LLVM linker.
  # We use the one provided by the user, attempt to use the one used to build
  # libomptarget or just fail.

  set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
    "Location of a CUDA compiler capable of emitting LLVM bitcode.")
  set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
    "Location of a linker capable of linking LLVM bitcode objects.")

  # The compiler ID is quoted so that an empty value compares as an empty
  # string instead of producing a malformed elseif() argument list.
  if(NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
    set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
  elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
    set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
  else()
    # NOTE(review): if libomptarget_error_say aborts configuration, the second
    # message is never printed - confirm against its definition.
    libomptarget_error_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.")
    libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER")
  endif()

  # Get compiler directory to try to locate a suitable linker. Quoted so an
  # empty compiler path is passed as one (empty) argument.
  get_filename_component(COMPILER_DIR "${CMAKE_C_COMPILER}" DIRECTORY)

  if(NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
    set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER})
  elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" AND EXISTS "${COMPILER_DIR}/llvm-link")
    # Use llvm-link from the directory containing clang
    set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${COMPILER_DIR}/llvm-link)
  else()
    libomptarget_error_say("Cannot find a linker capable of linking LLVM bitcode objects.")
    libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER")
  endif()

  if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER)
    libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")

    # Decide which ptx version to use. Same choices as Clang.
    if(CUDA_VERSION_MAJOR GREATER 9 OR CUDA_VERSION_MAJOR EQUAL 9)
      set(CUDA_PTX_VERSION ptx60)
    else()
      set(CUDA_PTX_VERSION ptx42)
    endif()

    # Set flags for Clang cuda compilation. Only Clang is supported because there is
    # no other compiler capable of generating bitcode from cuda sources.
    set(CUDA_FLAGS
      -emit-llvm
      -O1
      -Xclang -target-feature
      -Xclang +${CUDA_PTX_VERSION}
      --cuda-device-only
      -DOMPTARGET_NVPTX_TEST=0 -DOMPTARGET_NVPTX_DEBUG=0
    )

    # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared
    # to handle. Therefore, we use 'weak' instead. We are compiling only for the
    # device, so it should be equivalent.
    if(CUDA_VERSION_MAJOR EQUAL 9)
      set(CUDA_FLAGS ${CUDA_FLAGS} -Dnv_weak=weak)
    endif()

    # Clang spelling of the architecture flag for the requested compute
    # capability. This replaces the NVCC "-arch" form held in CUDA_ARCH.
    set(CUDA_ARCH --cuda-gpu-arch=sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})

    # Compile cuda files to bitcode: one custom command per source file, each
    # producing <source file name>.bc.
    set(bc_files "")
    foreach(src ${cuda_src_files})
      get_filename_component(infile ${src} ABSOLUTE)
      get_filename_component(outfile ${src} NAME)

      add_custom_command(OUTPUT ${outfile}.bc
        COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES}
          -c ${infile} -o ${outfile}.bc
        DEPENDS ${infile}
        IMPLICIT_DEPENDS CXX ${infile}
        COMMENT "Building LLVM bitcode ${outfile}.bc"
        VERBATIM
      )
      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}.bc)

      list(APPEND bc_files ${outfile}.bc)
    endforeach()

    # Link to a bitcode library. VERBATIM keeps argument escaping
    # platform-independent, matching the per-file compile commands.
    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc
      COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
        -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc ${bc_files}
      DEPENDS ${bc_files}
      COMMENT "Linking LLVM bitcode libomptarget-nvptx.bc"
      VERBATIM
    )
    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx.bc)

    # Target that drives the bitcode build as part of the default "all" build.
    add_custom_target(omptarget-nvptx-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc)

    # Copy library to destination: the directory holding the static device RTL.
    add_custom_command(TARGET omptarget-nvptx-bc POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc
        $<TARGET_FILE_DIR:omptarget-nvptx>
      VERBATIM
    )

    # Install device RTL under the lib destination folder.
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc DESTINATION "lib")

  endif()
endif()
197+
198+
else()
199+
libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.")
200+
endif()

0 commit comments

Comments
 (0)