diff --git a/Exercises/graph/Begin/CMakeLists.txt b/Exercises/graph/Begin/CMakeLists.txt new file mode 100644 index 00000000..2eb44af3 --- /dev/null +++ b/Exercises/graph/Begin/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.22) +project(KokkosTutorialExecGraph) +include(../../common.cmake) + +add_executable(graph graph_begin.cpp) +target_link_libraries(graph Kokkos::kokkos) diff --git a/Exercises/graph/Begin/graph_begin.cpp b/Exercises/graph/Begin/graph_begin.cpp new file mode 100644 index 00000000..4ad9ee29 --- /dev/null +++ b/Exercises/graph/Begin/graph_begin.cpp @@ -0,0 +1,186 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include + +#include + +#include +// EXERCISE: Include the right header! + +template +constexpr bool is_view_v = false; + +template +constexpr bool is_view_v> = true; + +template +concept view = is_view_v; + +using policy_t = Kokkos::RangePolicy<>; + +template +void init(D data, P pack_ids) { + Kokkos::parallel_for("Init Data", policy_t(0, data.extent(0)), + KOKKOS_LAMBDA(int i) { data(i) = i; }); + Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857); + Kokkos::fill_random(pack_ids, rand_pool64, data.extent(0)); +} + +// CUDA does not support auto return type from functions +// which create host device lambdas +template +struct pack_functor { + D data; + P pack_ids; + B buffer; + KOKKOS_FUNCTION void operator() (int i) const { + buffer(i) = data(pack_ids(i)); + } +}; + +// EXERCISE: take graph nodes instead of passing in execution space instances +// What should these functions return now? +// Use simple unconstrained templates for the graph node +template +void pack(Exec exec, D data, P pack_ids, B buffer) { + Kokkos::parallel_for("Pack One", policy_t(exec, 0, pack_ids.extent(0)), + pack_functor{data, pack_ids, buffer}); +} + +template +struct copy_functor { + Dest d; + Src s; + KOKKOS_FUNCTION void operator() (int i) const { + d(i) = s(i); + } +}; + +// EXERCISE: take graph nodes instead of passing in execution space instances +// What should these functions return now? +// Use simple unconstrained templates for the graph node +template +auto transfer(Exec exec, R recv, S send) { + Kokkos::parallel_for("DeepCopy", policy_t(exec, 0, recv.extent(0)), + copy_functor{recv, send}); + // EXERCISE the following should become a host node! + exec.fence(); + printf("HostTransfer %p %p\n",recv.data(), send.data()); +} + +template +auto unpack(Exec exec, D data, B buffer) { + Kokkos::parallel_for("DeepCopy", policy_t(exec, 0, buffer.extent(0)), + copy_functor{data, buffer}); +} + +void mpi_style_iteration(int num_elements, int num_mpi_neighs, int num_sendrecv, int num_repeat) { + Kokkos::View data("Data", num_elements + num_sendrecv); + Kokkos::View send_buffer("SendBuf", num_mpi_neighs, num_sendrecv); + Kokkos::View recv_buffer("RecvBuf", num_mpi_neighs, num_sendrecv); + Kokkos::View pack_ids("PackIDS", num_mpi_neighs, num_sendrecv); + init(data, pack_ids); + + Kokkos::Timer timer; + // EXERCISE: Create an Kokkos graph to capture work items + // Kokkos::Experimental::Graph graph; + + timer.reset(); + // EXERCISE Start creating your graph here + // Do you need the repeat here? + for(int r=0; r < num_repeat; r++) { + for(int neigh = 0; neigh < num_mpi_neighs; neigh++) { + // Create subviews for + auto my_pack_ids = Kokkos::subview(pack_ids, neigh, Kokkos::ALL()); + auto send_buf = Kokkos::subview(send_buffer, neigh, Kokkos::ALL()); + auto recv_buf = Kokkos::subview(recv_buffer, neigh, Kokkos::ALL()); + auto my_data = Kokkos::subview(data, Kokkos::pair{num_elements, (int)data.extent(0)}); + + Kokkos::DefaultExecutionSpace exec; + // EXERCISE: pass in graph nodes appropriately to connect functions + pack(exec, data, my_pack_ids, send_buf); + transfer(exec, recv_buf, send_buf); + unpack(exec, Kokkos::subview(data, Kokkos::pair{num_elements, (int)data.extent(0)}), recv_buf); + } + } + // EXERCISE: instantiate the graph object + // Kokkos::fence(); + // printf("Graph Create Done\n"); + + // EXERCISE: measure creation time here + double time = timer.seconds(); + printf("Runtime: %lf \n",time*1000); + + + // EXERCISE: ask the graph to exectute its tasks + // double time_create = timer.seconds(); + // timer.reset(); + // for(int r=0; r < num_repeat; r++) { + // EXERCISE: submit graph here! + // Kokkos::fence(); + // } + // double time = timer.seconds(); + // printf("Graph Runtime: %lf %lf\n",time*1000, time_create*1000); +} + + +int main( int argc, char* argv[] ) +{ + int64_t N = 20000; // number of elements + int neighs = 6; // number of neighbors + int num_send = 5000; // number of elements to send/recv + int nrepeat = 10; // number of repeats of the test + + // Read command line arguments. + for ( int i = 0; i < argc; i++ ) { + if ( strcmp( argv[ i ], "-N" ) == 0 ) { + N = atoi( argv[ ++i ] ); + printf( " User N is %lld\n", N ); + } + else if ( strcmp( argv[ i ], "-neighs" ) == 0 ) { + neighs = atoi( argv[ ++i ] ); + } + else if ( strcmp( argv[ i ], "-nsend" ) == 0 ) { + num_send = atoi( argv[ ++i ] ); + } + else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { + nrepeat = atoi( argv[ ++i ] ); + } + else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { + printf( " -N : number of elements (default: 20000)\n" ); + printf( " -neighs : number of neighbors (default: 6)\n" ); + printf( " -nsend : number of send/recv elements (default: 5000)\n" ); + printf( " -nrepeat : number of repetitions (default: 10)\n" ); + printf( " -help (-h): print this message\n\n" ); + exit( 1 ); + } + } + + + Kokkos::initialize( argc, argv ); + { + printf("Execute with %lld %i %i %i\n",N, neighs, num_send, nrepeat); + mpi_style_iteration(N, neighs, num_send, nrepeat); + } + Kokkos::finalize(); + + return 0; +} + diff --git a/Exercises/graph/Solution/CMakeLists.txt b/Exercises/graph/Solution/CMakeLists.txt new file mode 100644 index 00000000..8c7e8b8f --- /dev/null +++ b/Exercises/graph/Solution/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.22) +project(KokkosTutorialExecGraph) +include(../../common.cmake) + +add_executable(graph graph_solution.cpp) +target_link_libraries(graph Kokkos::kokkos) diff --git a/Exercises/graph/Solution/graph_solution.cpp b/Exercises/graph/Solution/graph_solution.cpp new file mode 100644 index 00000000..7820f6ba --- /dev/null +++ b/Exercises/graph/Solution/graph_solution.cpp @@ -0,0 +1,161 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include + +#include + +#include +#include + +template +constexpr bool is_view_v = false; + +template +constexpr bool is_view_v> = true; + +template +concept view = is_view_v; + +using policy_t = Kokkos::RangePolicy<>; + +template +void init(D data, P pack_ids) { + Kokkos::parallel_for("Init Data", policy_t(0, data.extent(0)), + KOKKOS_LAMBDA(int i) { data(i) = i; }); + Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857); + Kokkos::fill_random(pack_ids, rand_pool64, data.extent(0)); +} + +// CUDA does not support auto return type from functions +// which create host device lambdas +template +struct pack_functor { + D data; + P pack_ids; + B buffer; + KOKKOS_FUNCTION void operator() (int i) const { + buffer(i) = data(pack_ids(i)); + } +}; + +template +auto pack(GraphNode node, D data, P pack_ids, B buffer) { + return node.then_parallel_for("Pack One", policy_t(0, pack_ids.extent(0)), + pack_functor{data, pack_ids, buffer}); +} + +template +struct copy_functor { + Dest d; + Src s; + KOKKOS_FUNCTION void operator() (int i) const { + d(i) = s(i); + } +}; + +template +auto transfer(GraphNode node, R recv, S send) { + auto temp_node = node.then_parallel_for("DeepCopy", policy_t(0, recv.extent(0)), + copy_functor{recv, send}); + return temp_node.then_host("HostThing", [=]() { printf("HostTransfer %p %p\n",recv.data(), send.data()); }); +} + +template +auto unpack(GraphNode node, D data, B buffer) { + return node.then_parallel_for("DeepCopy", policy_t(0, buffer.extent(0)), + copy_functor{data, buffer}); +} + +void mpi_style_iteration(int num_elements, int num_mpi_neighs, int num_sendrecv, int num_repeat) { + Kokkos::View data("Data", num_elements + num_sendrecv); + Kokkos::View send_buffer("SendBuf", num_mpi_neighs, num_sendrecv); + Kokkos::View recv_buffer("RecvBuf", num_mpi_neighs, num_sendrecv); + Kokkos::View pack_ids("PackIDS", num_mpi_neighs, num_sendrecv); + init(data, pack_ids); + + Kokkos::Timer timer; + Kokkos::Experimental::Graph graph; + + for(int neigh = 0; neigh < num_mpi_neighs; neigh++) { + auto my_pack_ids = Kokkos::subview(pack_ids, neigh, Kokkos::ALL()); + auto send_buf = Kokkos::subview(send_buffer, neigh, Kokkos::ALL()); + auto recv_buf = Kokkos::subview(recv_buffer, neigh, Kokkos::ALL()); + auto node1 = pack(graph.root_node(), data, my_pack_ids, send_buf); + auto node2 = transfer(node1, recv_buf, send_buf); + auto node3 = unpack(node2, Kokkos::subview(data, Kokkos::pair{num_elements, (int)data.extent(0)}), recv_buf); + } + graph.instantiate(); + Kokkos::fence(); + printf("Graph Create Done\n"); + + double time_create = timer.seconds(); + timer.reset(); + for(int r=0; r < num_repeat; r++) { + graph.submit(); + Kokkos::fence(); + } + double time = timer.seconds(); + printf("Graph Runtime: %lf %lf\n",time*1000, time_create*1000); +} + + +int main( int argc, char* argv[] ) +{ + int64_t N = 20000; // number of elements + int neighs = 6; // number of neighbors + int num_send = 5000; // number of elements to send/recv + int nrepeat = 10; // number of repeats of the test + + // Read command line arguments. + for ( int i = 0; i < argc; i++ ) { + if ( strcmp( argv[ i ], "-N" ) == 0 ) { + N = atoi( argv[ ++i ] ); + printf( " User N is %lld\n", N ); + } + else if ( strcmp( argv[ i ], "-neighs" ) == 0 ) { + neighs = atoi( argv[ ++i ] ); + } + else if ( strcmp( argv[ i ], "-nsend" ) == 0 ) { + num_send = atoi( argv[ ++i ] ); + } + else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { + nrepeat = atoi( argv[ ++i ] ); + } + else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { + printf( " -N : number of elements (default: 20000)\n" ); + printf( " -neighs : number of neighbors (default: 6)\n" ); + printf( " -nsend : number of send/recv elements (default: 5000)\n" ); + printf( " -nrepeat : number of repetitions (default: 10)\n" ); + printf( " -help (-h): print this message\n\n" ); + exit( 1 ); + } + } + + + Kokkos::initialize( argc, argv ); + { + printf("Execute with %lld %i %i %i\n",N, neighs, num_send, nrepeat); + mpi_style_iteration(N, neighs, num_send, nrepeat); + } + Kokkos::finalize(); + + return 0; +} +