From af2a85e0c848a45176c1aac159db9053009b7b49 Mon Sep 17 00:00:00 2001 From: Pierre Kestener Date: Tue, 5 Dec 2023 23:06:43 +0100 Subject: [PATCH 1/2] Update simd_warp example. --- Exercises/simd_warp/Begin/Makefile | 2 +- Exercises/simd_warp/Begin/simd_warp_begin.cpp | 20 ++++++------ Exercises/simd_warp/Solution/Makefile | 2 +- .../simd_warp/Solution/simd_warp_solution.cpp | 31 ++++++++++--------- 4 files changed, 29 insertions(+), 26 deletions(-) diff --git a/Exercises/simd_warp/Begin/Makefile b/Exercises/simd_warp/Begin/Makefile index ee6f1559..5a9365c6 100644 --- a/Exercises/simd_warp/Begin/Makefile +++ b/Exercises/simd_warp/Begin/Makefile @@ -29,7 +29,7 @@ endif CXXFLAGS ?= -O3 -g override CXXFLAGS += -I$(MAKEFILE_PATH) -override CXXFLAGS += -I$(KOKKOS_PATH)/../simd-math +override CXXFLAGS += -I$(KOKKOS_PATH)/simd/src DEPFLAGS = -M LINK = ${CXX} diff --git a/Exercises/simd_warp/Begin/simd_warp_begin.cpp b/Exercises/simd_warp/Begin/simd_warp_begin.cpp index 757553f8..3aa46f6c 100644 --- a/Exercises/simd_warp/Begin/simd_warp_begin.cpp +++ b/Exercises/simd_warp/Begin/simd_warp_begin.cpp @@ -1,16 +1,11 @@ #include //EXERCISE: include the right header (later Kokkos will include this) -//#include +//#include void test_simd(int N_in, int M, int R, double a) { //EXERCISE: get the right type here for CUDA/Non-Cuda - //#ifdef KOKKOS_ENABLE_CUDA //using simd_t = ...; - //#else - //using simd_t = ...; - //#endif - //using simd_storage_t = ...; //EXERCISE: What will the N now be? int N = N_in; @@ -33,6 +28,13 @@ void test_simd(int N_in, int M, int R, double a) { }); Kokkos::deep_copy(results_scalar,0.0); + //EXERCISE: use TeamPolicy here +#ifdef KOKKOS_ENABLE_CUDA + constexpr int team_size = ...; +#else + constexpr int team_size = ...; +#endif + Kokkos::Timer timer; for(int r = 0; r data("D",N,M); + Kokkos::View data("D",N,M); Kokkos::View results("R",N); // Lets fill the input data @@ -105,7 +107,7 @@ void test_team_vector(int N, int M, int R, double a) { int main(int argc, char* argv[]) { Kokkos::initialize(argc,argv); - int N = argc>1?atoi(argv[1]):320000; + int N = argc>1?atoi(argv[1]):3200000; int M = argc>2?atoi(argv[2]):3; int R = argc>3?atoi(argv[3]):10; double scal = argc>4?atof(argv[4]):1.5; diff --git a/Exercises/simd_warp/Solution/Makefile b/Exercises/simd_warp/Solution/Makefile index ee6f1559..5a9365c6 100644 --- a/Exercises/simd_warp/Solution/Makefile +++ b/Exercises/simd_warp/Solution/Makefile @@ -29,7 +29,7 @@ endif CXXFLAGS ?= -O3 -g override CXXFLAGS += -I$(MAKEFILE_PATH) -override CXXFLAGS += -I$(KOKKOS_PATH)/../simd-math +override CXXFLAGS += -I$(KOKKOS_PATH)/simd/src DEPFLAGS = -M LINK = ${CXX} diff --git a/Exercises/simd_warp/Solution/simd_warp_solution.cpp b/Exercises/simd_warp/Solution/simd_warp_solution.cpp index 6755fd27..19bfdd35 100644 --- a/Exercises/simd_warp/Solution/simd_warp_solution.cpp +++ b/Exercises/simd_warp/Solution/simd_warp_solution.cpp @@ -1,19 +1,14 @@ #include -#include +#include void test_simd(int N_in, int M, int R, double a) { -#ifdef KOKKOS_ENABLE_CUDA - using simd_t = simd::simd>; -#else - using simd_t = simd::simd; -#endif - using simd_storage_t = simd_t::storage_type; + using simd_t = Kokkos::Experimental::native_simd; int N = N_in/simd_t::size(); - Kokkos::View data("D",N,M); - Kokkos::View results("R",N); + Kokkos::View data("D",N,M); + Kokkos::View results("R",N); // For the final reduction we gonna need a scalar view of the data for now // Relying on knowing the data layout, we will add SIMD Layouts later @@ -21,22 +16,28 @@ void test_simd(int N_in, int M, int R, double a) { Kokkos::View data_scalar((double*)data.data(),N_in,M); Kokkos::View results_scalar((double*)results.data(),N_in); - // Lets fill the data + // Lets fill the input data using scalar view Kokkos::parallel_for("init",data_scalar.extent(0), KOKKOS_LAMBDA(const int i) { for (int j=0; j(data.extent(0),1,simd_t::size()), + Kokkos::parallel_for("Combine",Kokkos::TeamPolicy<>(data.extent(0)/team_size,team_size,simd_t::size()), KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) { simd_t tmp = 0.0; double b = a; - const int i = team.league_rank(); + const int i = team.league_rank() * team.team_size() + team.team_rank(); for(int j=0; j data("D",N,M); + Kokkos::View data("D",N,M); Kokkos::View results("R",N); // Lets fill the input data @@ -99,7 +100,7 @@ void test_team_vector(int N, int M, int R, double a) { int main(int argc, char* argv[]) { Kokkos::initialize(argc,argv); - int N = argc>1?atoi(argv[1]):320000; + int N = argc>1?atoi(argv[1]):3200000; int M = argc>2?atoi(argv[2]):3; int R = argc>3?atoi(argv[3]):10; double scal = argc>4?atof(argv[4]):1.5; From e42ff135b3f19ef0c9e8a8707617ea7ef1826e86 Mon Sep 17 00:00:00 2001 From: Pierre Kestener Date: Tue, 5 Dec 2023 23:28:55 +0100 Subject: [PATCH 2/2] increase data size for more accurate/fair comparison. --- Exercises/simd_warp/Begin/simd_warp_begin.cpp | 2 +- Exercises/simd_warp/Solution/simd_warp_solution.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Exercises/simd_warp/Begin/simd_warp_begin.cpp b/Exercises/simd_warp/Begin/simd_warp_begin.cpp index 3aa46f6c..3b9024f0 100644 --- a/Exercises/simd_warp/Begin/simd_warp_begin.cpp +++ b/Exercises/simd_warp/Begin/simd_warp_begin.cpp @@ -107,7 +107,7 @@ void test_team_vector(int N, int M, int R, double a) { int main(int argc, char* argv[]) { Kokkos::initialize(argc,argv); - int N = argc>1?atoi(argv[1]):3200000; + int N = argc>1?atoi(argv[1]):32000000; int M = argc>2?atoi(argv[2]):3; int R = argc>3?atoi(argv[3]):10; double scal = argc>4?atof(argv[4]):1.5; diff --git a/Exercises/simd_warp/Solution/simd_warp_solution.cpp b/Exercises/simd_warp/Solution/simd_warp_solution.cpp index 19bfdd35..9bce17dc 100644 --- a/Exercises/simd_warp/Solution/simd_warp_solution.cpp +++ b/Exercises/simd_warp/Solution/simd_warp_solution.cpp @@ -100,7 +100,7 @@ void test_team_vector(int N, int M, int R, double a) { int main(int argc, char* argv[]) { Kokkos::initialize(argc,argv); - int N = argc>1?atoi(argv[1]):3200000; + int N = argc>1?atoi(argv[1]):32000000; int M = argc>2?atoi(argv[2]):3; int R = argc>3?atoi(argv[3]):10; double scal = argc>4?atof(argv[4]):1.5;