From e4445fca58b3805421060ef8384214cec70d3e44 Mon Sep 17 00:00:00 2001 From: Adrien Taberner Date: Wed, 19 Feb 2025 11:13:51 +0100 Subject: [PATCH 1/5] CI and Exercises update --- .../workflows/github-pr-linux-container.yaml | 16 +- .github/workflows/github-pr-unix.yaml | 17 +- Exercises/04/Solution/exercise_4_solution.cpp | 2 +- .../parallel_scan/Begin/parallel_scan.cpp | 16 +- .../random_number/Begin/MC_DartSampler.cpp | 38 ++- .../random_number/Solution/MC_DartSampler.cpp | 44 ++- Exercises/simd/Begin/simd_begin.cpp | 265 ++++++++++++------ Exercises/simd/Solution/simd_solution.cpp | 245 ++++++++++------ Exercises/simd_warp/Begin/CMakeLists.txt | 6 - Exercises/simd_warp/Begin/simd_warp_begin.cpp | 122 -------- Exercises/simd_warp/Solution/CMakeLists.txt | 6 - .../simd_warp/Solution/simd_warp_solution.cpp | 116 -------- .../subview/Begin/exercise_subview_begin.cpp | 10 +- .../Solution/exercise_subview_solution.cpp | 31 +- Scripts/ci-configure-build-test.bat | 19 +- Scripts/ci-configure-build-test.sh | 5 +- Scripts/ci-run-solutions.sh | 70 +++++ 17 files changed, 558 insertions(+), 470 deletions(-) delete mode 100644 Exercises/simd_warp/Begin/CMakeLists.txt delete mode 100644 Exercises/simd_warp/Begin/simd_warp_begin.cpp delete mode 100644 Exercises/simd_warp/Solution/CMakeLists.txt delete mode 100644 Exercises/simd_warp/Solution/simd_warp_solution.cpp create mode 100644 Scripts/ci-run-solutions.sh diff --git a/.github/workflows/github-pr-linux-container.yaml b/.github/workflows/github-pr-linux-container.yaml index 72cc7c4e..195779d6 100644 --- a/.github/workflows/github-pr-linux-container.yaml +++ b/.github/workflows/github-pr-linux-container.yaml @@ -60,11 +60,17 @@ jobs: run: cmake --build "${GITHUB_WORKSPACE}"/build-kokkos-kernels --config RelWithDebInfo --parallel 2 --target install - name: Configure and Build Exercises + run: > + bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-configure-build-test.sh + "${GITHUB_WORKSPACE}"/install-kokkos/lib/cmake/Kokkos + "${GITHUB_WORKSPACE}"/install-kokkos-kernels + "${GITHUB_WORKSPACE}"/kokkos-tutorials + "${GITHUB_WORKSPACE}"/kokkos/bin/nvcc_wrapper + RelWithDebInfo + CUDA + + - name: Run Solutions run: | - bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-configure-build-test.sh \ - "${GITHUB_WORKSPACE}"/install-kokkos/lib/cmake/Kokkos \ - "${GITHUB_WORKSPACE}"/install-kokkos-kernels \ + bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-run-solutions.sh \ "${GITHUB_WORKSPACE}"/kokkos-tutorials \ - "${GITHUB_WORKSPACE}"/kokkos/bin/nvcc_wrapper \ - RelWithDebInfo \ CUDA diff --git a/.github/workflows/github-pr-unix.yaml b/.github/workflows/github-pr-unix.yaml index d33b9d9d..2d9aeba8 100644 --- a/.github/workflows/github-pr-unix.yaml +++ b/.github/workflows/github-pr-unix.yaml @@ -76,11 +76,18 @@ jobs: run: cmake --build "${GITHUB_WORKSPACE}"/build-kokkos-kernels --config ${{ matrix.build_type }} --parallel 2 --target install - name: Configure and Build Exercises + run: > + bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-configure-build-test.sh + "${GITHUB_WORKSPACE}"/install-kokkos/lib/cmake/Kokkos + "${GITHUB_WORKSPACE}"/install-kokkos-kernels + "${GITHUB_WORKSPACE}"/kokkos-tutorials + ${{ matrix.cpp_compiler}} + ${{ matrix.build_type}} + ${{ matrix.backend }} + + - name: Run Solutions run: | - bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-configure-build-test.sh \ - "${GITHUB_WORKSPACE}"/install-kokkos/lib/cmake/Kokkos \ - "${GITHUB_WORKSPACE}"/install-kokkos-kernels \ + bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-run-solutions.sh \ "${GITHUB_WORKSPACE}"/kokkos-tutorials \ - ${{ matrix.cpp_compiler}} \ - ${{ matrix.build_type}} \ ${{ matrix.backend }} + diff --git a/Exercises/04/Solution/exercise_4_solution.cpp b/Exercises/04/Solution/exercise_4_solution.cpp index 6601fb77..abbbc314 100644 --- a/Exercises/04/Solution/exercise_4_solution.cpp +++ b/Exercises/04/Solution/exercise_4_solution.cpp @@ -70,7 +70,7 @@ int main( int argc, char* argv[] ) #define MemSpace Kokkos::CudaSpace #endif #ifdef KOKKOS_ENABLE_HIP - #define MemSpace Kokkos::Experimental::HIPSpace + #define MemSpace Kokkos::HIPSpace #endif #ifdef KOKKOS_ENABLE_OPENMPTARGET #define MemSpace Kokkos::OpenMPTargetSpace diff --git a/Exercises/parallel_scan/Begin/parallel_scan.cpp b/Exercises/parallel_scan/Begin/parallel_scan.cpp index 059c6bcb..a0124a16 100644 --- a/Exercises/parallel_scan/Begin/parallel_scan.cpp +++ b/Exercises/parallel_scan/Begin/parallel_scan.cpp @@ -18,7 +18,21 @@ #include template struct Factorial { - /* EXERCISE */ + using value_type = ValueType; + + Factorial(Kokkos::View view) : m_view(view) {} + + // EXERCISE: Implement the init method + // void init(...) {...} + + // EXERCISE: Implement the join method + // void join(...) {...} + + // EXERCISE: Implement the operator() method + // void operator()(...) const {...} + +private: + Kokkos::View m_view; }; int main(int argc, char *argv[]) { diff --git a/Exercises/random_number/Begin/MC_DartSampler.cpp b/Exercises/random_number/Begin/MC_DartSampler.cpp index b9696734..c9a1d5b6 100644 --- a/Exercises/random_number/Begin/MC_DartSampler.cpp +++ b/Exercises/random_number/Begin/MC_DartSampler.cpp @@ -155,22 +155,28 @@ struct GenRandom { // 1) cycle on the sample size and compare pi vs sample size. // 2) integer bit-size variation (64 vs 1024). +void checkSizes(int& N, int& dart_groups); int main(int argc, char* args[]) { - if ( argc < 2 ) { - printf("RNG Example: Need at least one argument (number darts) to run; second optional argument for serial_iterations\n"); - return (-1); + int N = -1; // Number of darts, 2^N + int dart_groups = -1; // Number of darts to draw per thread + + if ( argc > 1 ) { + N = std::atoi(args[1]); + printf("User N is %d\n", N); + } + if ( argc > 2 ) { + dart_groups = std::atoi(args[2]); + printf("User dart_groups is %d\n", dart_groups); } + checkSizes(N, dart_groups); + Kokkos::initialize(argc,args); { - const double rad = 1.0; // target radius (also box size) - const long N = atoi(args[1]); // exponent used to create number of darts, 2^N - - const long dart_groups = argc > 2 ? atoi(args[2]) : 1 ; - - const long darts = std::pow(2,N); // number of dart throws + const double rad = 1.0; // target radius (also box size) + const long darts = std::pow(2,N); // number of dart throws const double pi = 3.14159265358979323846 ; printf( "Reference Value for pi: %lf\n",pi); @@ -194,3 +200,17 @@ int main(int argc, char* args[]) { return 0; } +void checkSizes(int& N, int& dart_groups) +{ + if ( N == -1 && dart_groups == -1 ) { + printf("RNG Example Options:\n"); + printf(" : Number of darts 2^N (default: 2^22)\n"); + printf(" : Number of darts to draw per thread (default: 1)\n"); + } + if ( N == -1 ) { + N = 22; + } + if ( dart_groups == -1 ) { + dart_groups = 1; + } +} diff --git a/Exercises/random_number/Solution/MC_DartSampler.cpp b/Exercises/random_number/Solution/MC_DartSampler.cpp index e107f550..f8ed0ed5 100644 --- a/Exercises/random_number/Solution/MC_DartSampler.cpp +++ b/Exercises/random_number/Solution/MC_DartSampler.cpp @@ -143,22 +143,28 @@ struct GenRandom { // 1) cycle on the sample size and compare pi vs sample size. // 2) integer bit-size variation (64 vs 1024). +void checkSizes(int& N, int& dart_groups); int main(int argc, char* args[]) { - if ( argc < 2 ) { - printf("RNG Example: Need at least one argument (number darts) to run; second optional argument for serial_iterations\n"); - return (-1); + int N = -1; // Number of darts, 2^N + int dart_groups = -1; // Number of darts to draw per thread + + if ( argc > 1 ) { + N = std::atoi(args[1]); + printf("User N is %d\n", N); + } + if ( argc > 2 ) { + dart_groups = std::atoi(args[2]); + printf("User dart_groups is %d\n", dart_groups); } + checkSizes(N, dart_groups); + Kokkos::initialize(argc,args); { - const double rad = 1.0; // target radius (also box size) - const long N = atoi(args[1]); // exponent used to create number of darts, 2^N - - const long dart_groups = argc > 2 ? atoi(args[2]) : 1 ; - - const long darts = std::pow(2,N); // number of dart throws + const double rad = 1.0; // target radius (also box size) + const long darts = std::pow(2,N); // number of dart throws const double pi = 3.14159265358979323846 ; printf( "Reference Value for pi: %lf\n",pi); @@ -176,8 +182,22 @@ int main(int argc, char* args[]) { printf( "darts = %ld hits = %ld pi est = %lf\n", darts, circHits, 4.0*double(circHits)/double(darts) ); } - Kokkos::finalize(); - - return 0; + Kokkos::finalize(); + + return 0; } +void checkSizes(int& N, int& dart_groups) +{ + if ( N == -1 && dart_groups == -1 ) { + printf("RNG Example Options:\n"); + printf(" : Number of darts 2^N (default: 2^22)\n"); + printf(" : Number of darts to draw per thread (default: 1)\n"); + } + if ( N == -1 ) { + N = 22; + } + if ( dart_groups == -1 ) { + dart_groups = 1; + } +} diff --git a/Exercises/simd/Begin/simd_begin.cpp b/Exercises/simd/Begin/simd_begin.cpp index fc047715..afd10494 100644 --- a/Exercises/simd/Begin/simd_begin.cpp +++ b/Exercises/simd/Begin/simd_begin.cpp @@ -1,112 +1,199 @@ -#include -//EXERCISE: include the right header (later Kokkos will include this) -//#include +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@ + +// EXERCISE SIMD Goals +// Create views with the simd type +// Create unmanaged view to have a scalar acess +// Adapt the parallel reduce loop + +#include +#include +#include +#include +#include + +#include +// EXERCISE: Include Kokkos SIMD header file. +// #include + +#define MemSpace Kokkos::HostSpace +#define Layout Kokkos::LayoutRight + +using ExecSpace = MemSpace::execution_space; +using range_policy = Kokkos::RangePolicy; + +void checkSizes( int &N, int &M, int &S, int &nrepeat ); + +int main( int argc, char* argv[] ) +{ + int N = -1; // number of rows 2^12 + int M = -1; // number of columns 2^10 + int S = -1; // total size 2^22 + int nrepeat = 100; // number of repeats of the test + + // Read command line arguments. + for ( int i = 0; i < argc; i++ ) { + if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { + N = pow( 2, atoi( argv[ ++i ] ) ); + printf( " User N is %d\n", N ); + } + else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { + M = pow( 2, atof( argv[ ++i ] ) ); + printf( " User M is %d\n", M ); + } + else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { + S = pow( 2, atof( argv[ ++i ] ) ); + printf( " User S is %d\n", S ); + } + else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { + nrepeat = atoi( argv[ ++i ] ); + } + else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { + printf( " y^T*A*x Options:\n" ); + printf( " -Rows (-N) : exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); + printf( " -Columns (-M) : exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); + printf( " -Size (-S) : exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); + printf( " -nrepeat : number of repetitions (default: 100)\n" ); + printf( " -help (-h): print this message\n\n" ); + exit( 1 ); + } + } -void test_simd(int N_in, int M, int R, double a) { + // Check sizes. + checkSizes( N, M, S, nrepeat ); - //EXERCISE: get the right type here + Kokkos::initialize( argc, argv ); + { + //EXERCISE: Get the right type here. //using simd_t = ...; + + //EXERCISE: What will be the limit of the M loop ? + int M_simd = M; + + // EXERCISE: Create SIMD Views with the right dimensions. + Kokkos::View x( "x", M); + Kokkos::View A( "A", N, M); + + // EXERCISE: Create unmanaged scalar views for x and A. + // x_scalar((double*)x.data(), M); + // A_scalar((double*)A.data(), N, M); + Kokkos::View y( "y", N); + + // Initialize y vector. + for ( int i = 0; i < N; ++i ) { + y( i ) = 1; + } + // EXERCISE: Initialize the scalar view of x vector. + for ( int i = 0; i < M; ++i ) { + x( i ) = 1; + } + // EXERCISE: Initialize the scalar view of A matrix. + for ( int j = 0; j < N; ++j ) { + for ( int i = 0; i < M; ++i ) { + A( j, i ) = 1; + } + } + + // Timer products. + Kokkos::Timer timer; - //EXERCISE: What will the N now be? - int N = N_in; - - //EXERCISE: create SIMD Views instead - Kokkos::View data("D",N,M); - Kokkos::View results("R",N); - - // EXERCISE: create correctly a scalar view of of data and results - // For the final reduction we gonna need a scalar view of the data for now - // Relying on knowing the data layout, we will add SIMD Layouts later - // so that simple copy construction/assignment would work - Kokkos::View data_scalar(data); - Kokkos::View results_scalar(results); - - // Lets fill the input data using scalar view - Kokkos::parallel_for("init",data_scalar.extent(0), KOKKOS_LAMBDA(const int i) { - for (int j=0; j = y^T*A*x + double result = 0; - Kokkos::Timer timer; - for(int r = 0; r data("D",N,M); - Kokkos::View results("R",N); + // Calculate bandwidth. + // Each matrix A row (each of length M) is read once. + // The x vector (of length M) is read N times. + // The y vector (of length N) is read once. + // double Gbytes = 1.0e-9 * double( sizeof(double) * ( 2 * M * N + N ) ); + double Gbytes = 1.0e-9 * double( sizeof(double) * ( M + M * N + N ) ); - // Lets fill the input data - Kokkos::parallel_for("init",data.extent(0), KOKKOS_LAMBDA(const int i) { - for (int j=0; j 1024 ) { + M = 1024; + } + else { + M = S; + } + } + // If only M is undefined, set it. + if ( M == -1 ) M = S / N; -int main(int argc, char* argv[]) { - Kokkos::initialize(argc,argv); + // If N is undefined, set it. + if ( N == -1 ) N = S / M; - int N = argc>1?atoi(argv[1]):320000; - int M = argc>2?atoi(argv[2]):3; - int R = argc>3?atoi(argv[3]):10; - double scal = argc>4?atof(argv[4]):1.5; + printf( " Total size S = %d N = %d M = %d\n", S, N, M ); - if(N%32) { - printf("Please choose an N dividable by 32\n"); - return 0; + // Check sizes. + if ( ( S < 0 ) || ( N < 0 ) || ( M < 0 ) || ( nrepeat < 0 ) ) { + printf( " Sizes must be greater than 0.\n" ); + exit( 1 ); } - test_scalar(N,M,R,scal); - test_simd(N,M,R,scal); - - Kokkos::finalize(); + if ( ( N * M ) != S ) { + printf( " N * M != S\n" ); + exit( 1 ); + } } diff --git a/Exercises/simd/Solution/simd_solution.cpp b/Exercises/simd/Solution/simd_solution.cpp index 26da7b0f..919cc285 100644 --- a/Exercises/simd/Solution/simd_solution.cpp +++ b/Exercises/simd/Solution/simd_solution.cpp @@ -1,105 +1,192 @@ -#include -#include +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include +#include + +#include +#include + +#define MemSpace Kokkos::HostSpace +#define Layout Kokkos::LayoutRight + +using ExecSpace = MemSpace::execution_space; +using range_policy = Kokkos::RangePolicy; + +void checkSizes( int &N, int &M, int &S, int &nrepeat ); + +int main( int argc, char* argv[] ) +{ + int N = -1; // number of rows 2^12 + int M = -1; // number of columns 2^10 + int S = -1; // total size 2^22 + int nrepeat = 100; // number of repeats of the test + + // Read command line arguments. + for ( int i = 0; i < argc; i++ ) { + if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { + N = pow( 2, atoi( argv[ ++i ] ) ); + printf( " User N is %d\n", N ); + } + else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { + M = pow( 2, atof( argv[ ++i ] ) ); + printf( " User M is %d\n", M ); + } + else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { + S = pow( 2, atof( argv[ ++i ] ) ); + printf( " User S is %d\n", S ); + } + else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { + nrepeat = atoi( argv[ ++i ] ); + } + else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { + printf( " y^T*A*x Options:\n" ); + printf( " -Rows (-N) : exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); + printf( " -Columns (-M) : exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); + printf( " -Size (-S) : exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); + printf( " -nrepeat : number of repetitions (default: 100)\n" ); + printf( " -help (-h): print this message\n\n" ); + exit( 1 ); + } + } -void test_simd(int N_in, int M, int R, double a) { + // Check sizes. + checkSizes( N, M, S, nrepeat ); + Kokkos::initialize( argc, argv ); + { + // Use SIMD with double precision. using simd_t = Kokkos::Experimental::native_simd; + // Compute the new loop limit for SIMD data type. + int M_simd = M / simd_t::size(); - int N = N_in/simd_t::size(); - - Kokkos::View data("D",N,M); - Kokkos::View results("R",N); + // Create simd views. + Kokkos::View x( "x", M_simd);; + Kokkos::View A( "A", N, M_simd); - // For the final reduction we gonna need a scalar view of the data for now - // Relying on knowing the data layout, we will add SIMD Layouts later - // so that simple copy construction/assignment would work - Kokkos::View data_scalar((double*)data.data(),N_in,M); - Kokkos::View results_scalar((double*)results.data(),N_in); + // Unmanaged views for initialization. + Kokkos::View x_scalar((double*)x.data(), M); + Kokkos::View A_scalar((double*)A.data(), N, M); + Kokkos::View y( "y", N); - // Lets fill the input data using scalar view - Kokkos::parallel_for("init",data_scalar.extent(0), KOKKOS_LAMBDA(const int i) { - for (int j=0; j = y^T*A*x + double result = 0; + + Kokkos::parallel_reduce( "yAx", range_policy( 0, N ), KOKKOS_LAMBDA ( int j, double &update ) { + simd_t temp2(0.0); + for ( int i = 0; i < M_simd; ++i ) { + temp2 += A( j, i ) * x( i ); + } + // Accumulate the result for each SIMD lane. + for ( std::size_t i = 0; i < simd_t::size(); ++i) { + update += y(j) * temp2[i]; } - results(i) = tmp; - }); - Kokkos::fence(); + }, result ); + + // Output result. + if ( repeat == ( nrepeat - 1 ) ) { + printf( " Computed result for %d x %d is %lf\n", N, M, result ); + } + + const double solution = (double) N * (double) M; + + if ( result != solution ) { + printf( " Error: result( %lf ) != solution( %lf )\n", result, solution ); + } } + // Calculate time. double time = timer.seconds(); - double value = 0.0; - // Lets do the reduction here - Kokkos::parallel_reduce("Reduce",results_scalar.extent(0), KOKKOS_LAMBDA(const int i, double& lsum) { - lsum += results_scalar(i); - },value); + // Calculate bandwidth. + // Each matrix A row (each of length M) is read once. + // The x vector (of length M) is read N times. + // The y vector (of length N) is read once. + // double Gbytes = 1.0e-9 * double( sizeof(double) * ( 2 * M * N + N ) ); + double Gbytes = 1.0e-9 * double( sizeof(double) * ( M + M * N + N ) ); - printf("SIMD Time: %lf ms ( %e )\n",time*1000,value); -} - -void test_scalar(int N, int M, int R, double a) { - - Kokkos::View data("D",N,M); - Kokkos::View results("R",N); + // Print results (problem size, time and bandwidth in GB/s). + printf( " N( %d ) M( %d ) nrepeat ( %d ) problem( %g MB ) time( %g s ) bandwidth( %g GB/s )\n", + N, M, nrepeat, Gbytes * 1000, time, Gbytes * nrepeat / time ); - // Lets fill the input data - Kokkos::parallel_for("init",data.extent(0), KOKKOS_LAMBDA(const int i) { - for (int j=0; j 1024 ) { + M = 1024; + } + else { + M = S; + } + } + // If only M is undefined, set it. + if ( M == -1 ) M = S / N; -int main(int argc, char* argv[]) { - Kokkos::initialize(argc,argv); + // If N is undefined, set it. + if ( N == -1 ) N = S / M; - int N = argc>1?atoi(argv[1]):320000; - int M = argc>2?atoi(argv[2]):3; - int R = argc>3?atoi(argv[3]):10; - double scal = argc>4?atof(argv[4]):1.5; + printf( " Total size S = %d N = %d M = %d\n", S, N, M ); - if(N%32) { - printf("Please choose an N dividable by 32\n"); - return 0; + // Check sizes. + if ( ( S < 0 ) || ( N < 0 ) || ( M < 0 ) || ( nrepeat < 0 ) ) { + printf( " Sizes must be greater than 0.\n" ); + exit( 1 ); } - test_scalar(N,M,R,scal); - test_simd(N,M,R,scal); - - Kokkos::finalize(); + if ( ( N * M ) != S ) { + printf( " N * M != S\n" ); + exit( 1 ); + } } diff --git a/Exercises/simd_warp/Begin/CMakeLists.txt b/Exercises/simd_warp/Begin/CMakeLists.txt deleted file mode 100644 index 83dcecbe..00000000 --- a/Exercises/simd_warp/Begin/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(KokkosTutorialSIMDWarp) -include(../../common.cmake) - -add_executable(SIMDWarp simd_warp_begin.cpp) -target_link_libraries(SIMDWarp Kokkos::kokkos) diff --git a/Exercises/simd_warp/Begin/simd_warp_begin.cpp b/Exercises/simd_warp/Begin/simd_warp_begin.cpp deleted file mode 100644 index 757553f8..00000000 --- a/Exercises/simd_warp/Begin/simd_warp_begin.cpp +++ /dev/null @@ -1,122 +0,0 @@ -#include -//EXERCISE: include the right header (later Kokkos will include this) -//#include - -void test_simd(int N_in, int M, int R, double a) { - - //EXERCISE: get the right type here for CUDA/Non-Cuda - //#ifdef KOKKOS_ENABLE_CUDA - //using simd_t = ...; - //#else - //using simd_t = ...; - //#endif - //using simd_storage_t = ...; - - //EXERCISE: What will the N now be? - int N = N_in; - - //EXERCISE: create SIMD Views instead - Kokkos::View data("D",N,M); - Kokkos::View results("R",N); - - // EXERCISE: create correctly a scalar view of results and data - // For the final reduction we gonna need a scalar view of the data for now - // Relying on knowing the data layout, we will add SIMD Layouts later - // so that simple copy construction/assgnment would work - Kokkos::View data_scalar(data); - Kokkos::View results_scalar(results); - - // Lets fill the data deep_copy into scalar types doesn't work correctly for cuda_warp right now - Kokkos::parallel_for("init",data_scalar.extent(0), KOKKOS_LAMBDA(const int i) { - for (int j=0; j data("D",N,M); - Kokkos::View results("R",N); - - // Lets fill the input data - Kokkos::parallel_for("init",data.extent(0), KOKKOS_LAMBDA(const int i) { - for (int j=0; j(data.extent(0)/V,1,V), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) { - double b = a; - const int i = team.league_rank()*V; - for(int j=0; j1?atoi(argv[1]):320000; - int M = argc>2?atoi(argv[2]):3; - int R = argc>3?atoi(argv[3]):10; - double scal = argc>4?atof(argv[4]):1.5; - - if(N%32) { - printf("Please choose an N dividable by 32\n"); - return 0; - } - - test_team_vector(N,M,R,scal); - test_simd(N,M,R,scal); - - Kokkos::finalize(); -} diff --git a/Exercises/simd_warp/Solution/CMakeLists.txt b/Exercises/simd_warp/Solution/CMakeLists.txt deleted file mode 100644 index 80cc6654..00000000 --- a/Exercises/simd_warp/Solution/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(KokkosTutorialSIMDWarp) -include(../../common.cmake) - -add_executable(SIMDWarp simd_warp_solution.cpp) -target_link_libraries(SIMDWarp Kokkos::kokkos) diff --git a/Exercises/simd_warp/Solution/simd_warp_solution.cpp b/Exercises/simd_warp/Solution/simd_warp_solution.cpp deleted file mode 100644 index 6755fd27..00000000 --- a/Exercises/simd_warp/Solution/simd_warp_solution.cpp +++ /dev/null @@ -1,116 +0,0 @@ -#include -#include - -void test_simd(int N_in, int M, int R, double a) { - -#ifdef KOKKOS_ENABLE_CUDA - using simd_t = simd::simd>; -#else - using simd_t = simd::simd; -#endif - using simd_storage_t = simd_t::storage_type; - - int N = N_in/simd_t::size(); - - Kokkos::View data("D",N,M); - Kokkos::View results("R",N); - - // For the final reduction we gonna need a scalar view of the data for now - // Relying on knowing the data layout, we will add SIMD Layouts later - // so that simple copy construction/assignment would work - Kokkos::View data_scalar((double*)data.data(),N_in,M); - Kokkos::View results_scalar((double*)results.data(),N_in); - - // Lets fill the data - Kokkos::parallel_for("init",data_scalar.extent(0), KOKKOS_LAMBDA(const int i) { - for (int j=0; j(data.extent(0),1,simd_t::size()), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) { - simd_t tmp = 0.0; - double b = a; - const int i = team.league_rank(); - for(int j=0; j data("D",N,M); - Kokkos::View results("R",N); - - // Lets fill the input data - Kokkos::parallel_for("init",data.extent(0), KOKKOS_LAMBDA(const int i) { - for (int j=0; j(data.extent(0)/V,1,V), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) { - double b = a; - const int i = team.league_rank()*V; - for(int j=0; j1?atoi(argv[1]):320000; - int M = argc>2?atoi(argv[2]):3; - int R = argc>3?atoi(argv[3]):10; - double scal = argc>4?atof(argv[4]):1.5; - - if(N%32) { - printf("Please choose an N dividable by 32\n"); - return 0; - } - - test_team_vector(N,M,R,scal); - test_simd(N,M,R,scal); - - Kokkos::finalize(); -} diff --git a/Exercises/subview/Begin/exercise_subview_begin.cpp b/Exercises/subview/Begin/exercise_subview_begin.cpp index e656c0f9..3a071701 100644 --- a/Exercises/subview/Begin/exercise_subview_begin.cpp +++ b/Exercises/subview/Begin/exercise_subview_begin.cpp @@ -71,14 +71,16 @@ int main( int argc, char* argv[] ) Kokkos::initialize( argc, argv ); { - // using ExecSpace = Kokkos::Serial; + using ExecSpace = Kokkos::Serial; // using ExecSpace = Kokkos::Threads; // using ExecSpace = Kokkos::OpenMP; - using ExecSpace = Kokkos::Cuda; + // using ExecSpace = Kokkos::Cuda; + // using ExecSpace = Kokkos::HIP; - // using MemSpace = Kokkos::HostSpace; + using MemSpace = Kokkos::HostSpace; // using MemSpace = Kokkos::OpenMP; - using MemSpace = Kokkos::CudaSpace; + // using MemSpace = Kokkos::CudaSpace; + // using MemSpace = Kokkos::HIPSpace; // using MemSpace = Kokkos::CudaUVMSpace; using Layout = Kokkos::LayoutLeft; diff --git a/Exercises/subview/Solution/exercise_subview_solution.cpp b/Exercises/subview/Solution/exercise_subview_solution.cpp index 981dcba9..d44f9d7e 100644 --- a/Exercises/subview/Solution/exercise_subview_solution.cpp +++ b/Exercises/subview/Solution/exercise_subview_solution.cpp @@ -65,17 +65,26 @@ int main( int argc, char* argv[] ) Kokkos::initialize( argc, argv ); { - // using ExecSpace = Kokkos::Serial; - // using ExecSpace = Kokkos::Threads; - using ExecSpace = Kokkos::OpenMP; - // using ExecSpace = Kokkos::Cuda; - // using ExecSpace = Kokkos::HIP; - - // using MemSpace = Kokkos::HostSpace; - using MemSpace = Kokkos::OpenMP; - // using MemSpace = Kokkos::CudaSpace; - // using MemSpace = Kokkos::CudaUVMSpace; - // using MemSpace = Kokkos::HIPSpace; + #ifdef KOKKOS_ENABLE_CUDA + #define MemSpace Kokkos::CudaSpace + #define ExecSpace Kokkos::Cuda + #endif + #ifdef KOKKOS_ENABLE_HIP + #define MemSpace Kokkos::HIPSpace + #define ExecSpace Kokkos::HIP + #endif + #ifdef KOKKOS_ENABLE_THREADS + #define MemSpace Kokkos::HostSpace + #define ExecSpace Kokkos::Threads + #endif + + #ifndef MemSpace + #define MemSpace Kokkos::HostSpace + #endif + + #ifndef ExecSpace + #define ExecSpace Kokkos::Serial + #endif // using Layout = Kokkos::LayoutLeft; using Layout = Kokkos::LayoutRight; diff --git a/Scripts/ci-configure-build-test.bat b/Scripts/ci-configure-build-test.bat index 1e0e22b4..77846dfa 100644 --- a/Scripts/ci-configure-build-test.bat +++ b/Scripts/ci-configure-build-test.bat @@ -12,7 +12,24 @@ set cpp_compiler=%~3 set build_type=%~4 set backend=%~5 -set "EXERCISES=01 02 03" +set EXERCISES=01 02 03 +set EXERCISES=%EXERCISES% dualview +set EXERCISES=%EXERCISES% kokkoskernels/BlockJacobi +set EXERCISES=%EXERCISES% kokkoskernels/GaussSeidel +set EXERCISES=%EXERCISES% kokkoskernels/GraphColoring +set EXERCISES=%EXERCISES% kokkoskernels/InnerProduct +set EXERCISES=%EXERCISES% mdrange +set EXERCISES=%EXERCISES% mpi_pack_unpack +set EXERCISES=%EXERCISES% parallel_scan +set EXERCISES=%EXERCISES% random_number +set EXERCISES=%EXERCISES% scatter_view +set EXERCISES=%EXERCISES% simd +set EXERCISES=%EXERCISES% subview +set EXERCISES=%EXERCISES% team_policy +set EXERCISES=%EXERCISES% team_scratch_memory +set EXERCISES=%EXERCISES% team_vector_loop +set EXERCISES=%EXERCISES% unordered_map + if "%backend%"=="CUDA" set "EXERCISES=%EXERCISES% 04" set Kokkos_ROOT=%kokkos_root% diff --git a/Scripts/ci-configure-build-test.sh b/Scripts/ci-configure-build-test.sh index ce3f4bab..1e5fdb48 100644 --- a/Scripts/ci-configure-build-test.sh +++ b/Scripts/ci-configure-build-test.sh @@ -13,9 +13,6 @@ backend="$6" # TODO: advanced_reductions seems broken # TODO: hpcbind does not use cmake # TODO: instances does not use cmake -# TODO: parallel_scan seems broken -# TODO: simd_warp seems broken -# TODO: subview seems broken # TODO: vectorshift needs Kokkos Remote Spaces # TODO: kokkoskernels/CGSolve_SpILUKprecond needs to know where Kokkos Kernels source directory is # TODO: kokkoskernels/SpILUK needs to know where Kokkos Kernels source directory is @@ -32,9 +29,11 @@ kokkoskernels/GraphColoring kokkoskernels/InnerProduct mdrange mpi_pack_unpack +parallel_scan random_number scatter_view simd +subview team_policy team_scratch_memory team_vector_loop diff --git a/Scripts/ci-run-solutions.sh b/Scripts/ci-run-solutions.sh new file mode 100644 index 00000000..2e38c349 --- /dev/null +++ b/Scripts/ci-run-solutions.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +set -eou pipefail + +tutorials_src="$1" +backend="$2" + +# These are exercises with executables that can be run in the Solution subdirectory +# TODO: advanced_reductions seems broken +# TODO: hpcbind does not use cmake +# TODO: instances does not use cmake +# TODO: vectorshift needs Kokkos Remote Spaces +# TODO: kokkoskernels/CGSolve_SpILUKprecond needs to know where Kokkos Kernels source directory is +# TODO: kokkoskernels/SpILUK needs to know where Kokkos Kernels source directory is +# TODO: kokkoskernels/TeamGemm seems broken +# TODO: mpi_heat_conduction/no-mpi does not use cmake +# TODO: mpi_pack_unpack need to be run with MPI +SOLUTION_EXERCISES=( +01 +02 +03 +dualview +kokkoskernels/BlockJacobi +kokkoskernels/GaussSeidel +kokkoskernels/GraphColoring +kokkoskernels/InnerProduct +mdrange +parallel_scan +random_number +scatter_view +simd +subview +team_policy +team_scratch_memory +team_vector_loop +unordered_map +) + +if [ "$backend" == CUDA ]; then + SOLUTION_EXERCISES+=(04) + SOLUTION_EXERCISES+=(multi_gpu_cuda) +fi + +if [ ! "$backend" == CUDA ]; then + SOLUTION_EXERCISES+=(tasking) + SOLUTION_EXERCISES+=(virtualfunction) +fi + +if [ "$backend" == OPENMP ]; then + SOLUTION_EXERCISES+=(unique_token) + export OMP_PROC_BIND=spread + export OMP_PLACES=threads +fi + +if [[ ! "$OSTYPE" == "darwin"* ]]; then + SOLUTION_EXERCISES+=(fortran-kokkosinterface) +fi + +for e in "${SOLUTION_EXERCISES[@]}"; do +solution_dir="build/Exercises/$e/Solution" + if [ -d "$solution_dir" ]; then + # Executable doesen't follow a naming convention + for executable in "$solution_dir"/*; do + if [ -x "$executable" ] && [ ! -d "$executable" ]; then + echo "Running $executable" + "$executable" + fi + done + fi +done From 62a41eaa79f4e8578cf26141aa0f92888fe084c0 Mon Sep 17 00:00:00 2001 From: Adrien Taberner Date: Fri, 21 Feb 2025 10:28:17 +0100 Subject: [PATCH 2/5] Test CI --- .../workflows/github-pr-linux-container.yaml | 16 +++++--------- .github/workflows/github-pr-unix.yaml | 4 +++- Exercises/04/Solution/exercise_4_solution.cpp | 15 +------------ .../Solution/exercise_subview_solution.cpp | 22 ++----------------- Scripts/ci-run-solutions.sh | 1 - 5 files changed, 11 insertions(+), 47 deletions(-) diff --git a/.github/workflows/github-pr-linux-container.yaml b/.github/workflows/github-pr-linux-container.yaml index 195779d6..72cc7c4e 100644 --- a/.github/workflows/github-pr-linux-container.yaml +++ b/.github/workflows/github-pr-linux-container.yaml @@ -60,17 +60,11 @@ jobs: run: cmake --build "${GITHUB_WORKSPACE}"/build-kokkos-kernels --config RelWithDebInfo --parallel 2 --target install - name: Configure and Build Exercises - run: > - bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-configure-build-test.sh - "${GITHUB_WORKSPACE}"/install-kokkos/lib/cmake/Kokkos - "${GITHUB_WORKSPACE}"/install-kokkos-kernels - "${GITHUB_WORKSPACE}"/kokkos-tutorials - "${GITHUB_WORKSPACE}"/kokkos/bin/nvcc_wrapper - RelWithDebInfo - CUDA - - - name: Run Solutions run: | - bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-run-solutions.sh \ + bash "${GITHUB_WORKSPACE}"/kokkos-tutorials/Scripts/ci-configure-build-test.sh \ + "${GITHUB_WORKSPACE}"/install-kokkos/lib/cmake/Kokkos \ + "${GITHUB_WORKSPACE}"/install-kokkos-kernels \ "${GITHUB_WORKSPACE}"/kokkos-tutorials \ + "${GITHUB_WORKSPACE}"/kokkos/bin/nvcc_wrapper \ + RelWithDebInfo \ CUDA diff --git a/.github/workflows/github-pr-unix.yaml b/.github/workflows/github-pr-unix.yaml index 2d9aeba8..af896128 100644 --- a/.github/workflows/github-pr-unix.yaml +++ b/.github/workflows/github-pr-unix.yaml @@ -3,7 +3,7 @@ name: Hosted Runners (unix) on: push: pull_request: - branches: [ "main" ] + branches: [ "ci-test" ] workflow_dispatch: jobs: @@ -59,6 +59,8 @@ jobs: -DCMAKE_INSTALL_PREFIX="${GITHUB_WORKSPACE}"/install-kokkos -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DKokkos_ENABLE_${{ matrix.backend }}=ON + -DKokkos_ENABLE_SERIAL=ON -DKokkos_ENABLE_COMPILER_WARNINGS=ON - name: Build & Install Kokkos diff --git a/Exercises/04/Solution/exercise_4_solution.cpp b/Exercises/04/Solution/exercise_4_solution.cpp index abbbc314..694cd4ec 100644 --- a/Exercises/04/Solution/exercise_4_solution.cpp +++ b/Exercises/04/Solution/exercise_4_solution.cpp @@ -66,20 +66,7 @@ int main( int argc, char* argv[] ) Kokkos::initialize( argc, argv ); { - #ifdef KOKKOS_ENABLE_CUDA - #define MemSpace Kokkos::CudaSpace - #endif - #ifdef KOKKOS_ENABLE_HIP - #define MemSpace Kokkos::HIPSpace - #endif - #ifdef KOKKOS_ENABLE_OPENMPTARGET - #define MemSpace Kokkos::OpenMPTargetSpace - #endif - - #ifndef MemSpace - #define MemSpace Kokkos::HostSpace - #endif - + using MemSpace = Kokkos::DefaultExecutionSpace::memory_space; using ExecSpace = MemSpace::execution_space; using range_policy = Kokkos::RangePolicy; diff --git a/Exercises/subview/Solution/exercise_subview_solution.cpp b/Exercises/subview/Solution/exercise_subview_solution.cpp index d44f9d7e..406e9f7b 100644 --- a/Exercises/subview/Solution/exercise_subview_solution.cpp +++ b/Exercises/subview/Solution/exercise_subview_solution.cpp @@ -65,26 +65,8 @@ int main( int argc, char* argv[] ) Kokkos::initialize( argc, argv ); { - #ifdef KOKKOS_ENABLE_CUDA - #define MemSpace Kokkos::CudaSpace - #define ExecSpace Kokkos::Cuda - #endif - #ifdef KOKKOS_ENABLE_HIP - #define MemSpace Kokkos::HIPSpace - #define ExecSpace Kokkos::HIP - #endif - #ifdef KOKKOS_ENABLE_THREADS - #define MemSpace Kokkos::HostSpace - #define ExecSpace Kokkos::Threads - #endif - - #ifndef MemSpace - #define MemSpace Kokkos::HostSpace - #endif - - #ifndef ExecSpace - #define ExecSpace Kokkos::Serial - #endif + using MemSpace = Kokkos::DefaultExecutionSpace::memory_space; + using ExecSpace = MemSpace::execution_space; // using Layout = Kokkos::LayoutLeft; using Layout = Kokkos::LayoutRight; diff --git a/Scripts/ci-run-solutions.sh b/Scripts/ci-run-solutions.sh index 2e38c349..109fe0f6 100644 --- a/Scripts/ci-run-solutions.sh +++ b/Scripts/ci-run-solutions.sh @@ -42,7 +42,6 @@ if [ "$backend" == CUDA ]; then fi if [ ! "$backend" == CUDA ]; then - SOLUTION_EXERCISES+=(tasking) SOLUTION_EXERCISES+=(virtualfunction) fi From b1570f649f9c94b68d1c4a738999b78c12f62be6 Mon Sep 17 00:00:00 2001 From: Adrien Taberner Date: Wed, 26 Feb 2025 10:42:54 +0100 Subject: [PATCH 3/5] ci-change --- .github/workflows/github-pr-unix.yaml | 2 +- Exercises/04/Solution/exercise_4_solution.cpp | 4 ++-- Exercises/parallel_scan/Begin/parallel_scan.cpp | 16 +--------------- .../subview/Begin/exercise_subview_begin.cpp | 2 -- .../Solution/exercise_subview_solution.cpp | 5 ++--- Scripts/ci-configure-build-test.bat | 1 - 6 files changed, 6 insertions(+), 24 deletions(-) diff --git a/.github/workflows/github-pr-unix.yaml b/.github/workflows/github-pr-unix.yaml index af896128..6734d1b7 100644 --- a/.github/workflows/github-pr-unix.yaml +++ b/.github/workflows/github-pr-unix.yaml @@ -3,7 +3,7 @@ name: Hosted Runners (unix) on: push: pull_request: - branches: [ "ci-test" ] + branches: [ "main" ] workflow_dispatch: jobs: diff --git a/Exercises/04/Solution/exercise_4_solution.cpp b/Exercises/04/Solution/exercise_4_solution.cpp index 694cd4ec..317f9253 100644 --- a/Exercises/04/Solution/exercise_4_solution.cpp +++ b/Exercises/04/Solution/exercise_4_solution.cpp @@ -66,8 +66,8 @@ int main( int argc, char* argv[] ) Kokkos::initialize( argc, argv ); { - using MemSpace = Kokkos::DefaultExecutionSpace::memory_space; - using ExecSpace = MemSpace::execution_space; + using ExecSpace = Kokkos::DefaultExecutionSpace; + using MemSpace = ExecSpace::memory_space; using range_policy = Kokkos::RangePolicy; // Allocate y, x vectors and Matrix A on device. diff --git a/Exercises/parallel_scan/Begin/parallel_scan.cpp b/Exercises/parallel_scan/Begin/parallel_scan.cpp index a0124a16..059c6bcb 100644 --- a/Exercises/parallel_scan/Begin/parallel_scan.cpp +++ b/Exercises/parallel_scan/Begin/parallel_scan.cpp @@ -18,21 +18,7 @@ #include template struct Factorial { - using value_type = ValueType; - - Factorial(Kokkos::View view) : m_view(view) {} - - // EXERCISE: Implement the init method - // void init(...) {...} - - // EXERCISE: Implement the join method - // void join(...) {...} - - // EXERCISE: Implement the operator() method - // void operator()(...) const {...} - -private: - Kokkos::View m_view; + /* EXERCISE */ }; int main(int argc, char *argv[]) { diff --git a/Exercises/subview/Begin/exercise_subview_begin.cpp b/Exercises/subview/Begin/exercise_subview_begin.cpp index 3a071701..d00af802 100644 --- a/Exercises/subview/Begin/exercise_subview_begin.cpp +++ b/Exercises/subview/Begin/exercise_subview_begin.cpp @@ -156,8 +156,6 @@ int main( int argc, char* argv[] ) } } - - // Calculate time. double time = timer.seconds(); diff --git a/Exercises/subview/Solution/exercise_subview_solution.cpp b/Exercises/subview/Solution/exercise_subview_solution.cpp index 406e9f7b..a4f031b1 100644 --- a/Exercises/subview/Solution/exercise_subview_solution.cpp +++ b/Exercises/subview/Solution/exercise_subview_solution.cpp @@ -65,8 +65,8 @@ int main( int argc, char* argv[] ) Kokkos::initialize( argc, argv ); { - using MemSpace = Kokkos::DefaultExecutionSpace::memory_space; - using ExecSpace = MemSpace::execution_space; + using ExecSpace = Kokkos::DefaultExecutionSpace; + using MemSpace = ExecSpace::memory_space; // using Layout = Kokkos::LayoutLeft; using Layout = Kokkos::LayoutRight; @@ -138,7 +138,6 @@ int main( int argc, char* argv[] ) } } - // Calculate time. double time = timer.seconds(); diff --git a/Scripts/ci-configure-build-test.bat b/Scripts/ci-configure-build-test.bat index 77846dfa..d8cccbb0 100644 --- a/Scripts/ci-configure-build-test.bat +++ b/Scripts/ci-configure-build-test.bat @@ -19,7 +19,6 @@ set EXERCISES=%EXERCISES% kokkoskernels/GaussSeidel set EXERCISES=%EXERCISES% kokkoskernels/GraphColoring set EXERCISES=%EXERCISES% kokkoskernels/InnerProduct set EXERCISES=%EXERCISES% mdrange -set EXERCISES=%EXERCISES% mpi_pack_unpack set EXERCISES=%EXERCISES% parallel_scan set EXERCISES=%EXERCISES% random_number set EXERCISES=%EXERCISES% scatter_view From 0d17e131f9edbf8f4cbb4ef23a7dff34b6a2781f Mon Sep 17 00:00:00 2001 From: Adrien Taberner Date: Fri, 28 Feb 2025 16:43:32 +0100 Subject: [PATCH 4/5] cout -> printf --- Exercises/unique_token/Begin/unique_token.cpp | 4 ++-- Exercises/unique_token/Solution/unique_token.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Exercises/unique_token/Begin/unique_token.cpp b/Exercises/unique_token/Begin/unique_token.cpp index 0af6becb..36ce9b9e 100644 --- a/Exercises/unique_token/Begin/unique_token.cpp +++ b/Exercises/unique_token/Begin/unique_token.cpp @@ -1,4 +1,4 @@ -#include +#include // EXERCISE: need to remove the ifdef... #ifdef KOKKOS_ENABLE_OPENMP @@ -89,7 +89,7 @@ int main(int argc, char* argv[]) { Kokkos::deep_copy(values,values_h); double time_dup = scatter_add_loop(values,results,D); - std::cout << "Time Duplicated: " << N << " " << M << " " << time_dup << std::endl; + printf("Time Duplicated: %d %d %lf\n",N,M,time_dup); } Kokkos::finalize(); diff --git a/Exercises/unique_token/Solution/unique_token.cpp b/Exercises/unique_token/Solution/unique_token.cpp index b13db287..5fba229a 100644 --- a/Exercises/unique_token/Solution/unique_token.cpp +++ b/Exercises/unique_token/Solution/unique_token.cpp @@ -1,4 +1,4 @@ -#include +#include using atomic_2d_view = Kokkos::View >; @@ -87,7 +87,7 @@ int main(int argc, char* argv[]) { Kokkos::deep_copy(values,values_h); double time_dup = scatter_add_loop(values,results,D); - std::cout << "Time Duplicated: " << N << " " << M << " " << time_dup << std::endl; + printf("Time Duplicated: %d %d %d %lf\n",N,M,D,time_dup); } Kokkos::finalize(); From 2c052178e1151f19e003bced08d818b4267af05f Mon Sep 17 00:00:00 2001 From: Adrien Taberner Date: Fri, 5 Sep 2025 16:25:55 +0200 Subject: [PATCH 5/5] Reset exercise simd Signed-off-by: Adrien Taberner --- Exercises/simd/Begin/simd_begin.cpp | 265 ++++++++-------------- Exercises/simd/Solution/simd_solution.cpp | 245 +++++++------------- 2 files changed, 168 insertions(+), 342 deletions(-) diff --git a/Exercises/simd/Begin/simd_begin.cpp b/Exercises/simd/Begin/simd_begin.cpp index afd10494..fc047715 100644 --- a/Exercises/simd/Begin/simd_begin.cpp +++ b/Exercises/simd/Begin/simd_begin.cpp @@ -1,199 +1,112 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@ - -// EXERCISE SIMD Goals -// Create views with the simd type -// Create unmanaged view to have a scalar acess -// Adapt the parallel reduce loop - -#include -#include -#include -#include -#include - -#include -// EXERCISE: Include Kokkos SIMD header file. -// #include - -#define MemSpace Kokkos::HostSpace -#define Layout Kokkos::LayoutRight - -using ExecSpace = MemSpace::execution_space; -using range_policy = Kokkos::RangePolicy; - -void checkSizes( int &N, int &M, int &S, int &nrepeat ); - -int main( int argc, char* argv[] ) -{ - int N = -1; // number of rows 2^12 - int M = -1; // number of columns 2^10 - int S = -1; // total size 2^22 - int nrepeat = 100; // number of repeats of the test - - // Read command line arguments. - for ( int i = 0; i < argc; i++ ) { - if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { - N = pow( 2, atoi( argv[ ++i ] ) ); - printf( " User N is %d\n", N ); - } - else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { - M = pow( 2, atof( argv[ ++i ] ) ); - printf( " User M is %d\n", M ); - } - else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { - S = pow( 2, atof( argv[ ++i ] ) ); - printf( " User S is %d\n", S ); - } - else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { - nrepeat = atoi( argv[ ++i ] ); - } - else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { - printf( " y^T*A*x Options:\n" ); - printf( " -Rows (-N) : exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); - printf( " -Columns (-M) : exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); - printf( " -Size (-S) : exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); - printf( " -nrepeat : number of repetitions (default: 100)\n" ); - printf( " -help (-h): print this message\n\n" ); - exit( 1 ); - } - } +#include +//EXERCISE: include the right header (later Kokkos will include this) +//#include - // Check sizes. - checkSizes( N, M, S, nrepeat ); +void test_simd(int N_in, int M, int R, double a) { - Kokkos::initialize( argc, argv ); - { - //EXERCISE: Get the right type here. + //EXERCISE: get the right type here //using simd_t = ...; - - //EXERCISE: What will be the limit of the M loop ? - int M_simd = M; - - // EXERCISE: Create SIMD Views with the right dimensions. - Kokkos::View x( "x", M); - Kokkos::View A( "A", N, M); - - // EXERCISE: Create unmanaged scalar views for x and A. - // x_scalar((double*)x.data(), M); - // A_scalar((double*)A.data(), N, M); - Kokkos::View y( "y", N); - - // Initialize y vector. - for ( int i = 0; i < N; ++i ) { - y( i ) = 1; - } - // EXERCISE: Initialize the scalar view of x vector. - for ( int i = 0; i < M; ++i ) { - x( i ) = 1; - } - // EXERCISE: Initialize the scalar view of A matrix. - for ( int j = 0; j < N; ++j ) { - for ( int i = 0; i < M; ++i ) { - A( j, i ) = 1; - } - } - - // Timer products. - Kokkos::Timer timer; - for ( int repeat = 0; repeat < nrepeat; repeat++ ) { - // Application: = y^T*A*x - double result = 0; + //EXERCISE: What will the N now be? + int N = N_in; + + //EXERCISE: create SIMD Views instead + Kokkos::View data("D",N,M); + Kokkos::View results("R",N); + + // EXERCISE: create correctly a scalar view of of data and results + // For the final reduction we gonna need a scalar view of the data for now + // Relying on knowing the data layout, we will add SIMD Layouts later + // so that simple copy construction/assignment would work + Kokkos::View data_scalar(data); + Kokkos::View results_scalar(results); - Kokkos::parallel_reduce( "yAx", range_policy( 0, N ), KOKKOS_LAMBDA ( int j, double &update ) { - // EXERCISE: What will be the type of temp2 ? - double temp2(0.0); - // EXERCISE: What will be the loop limit for SIMD data type ? - for ( int i = 0; i < M; ++i ) { - temp2 += A( j, i ) * x( i ); + // Lets fill the input data using scalar view + Kokkos::parallel_for("init",data_scalar.extent(0), KOKKOS_LAMBDA(const int i) { + for (int j=0; j data("D",N,M); + Kokkos::View results("R",N); -void checkSizes( int &N, int &M, int &S, int &nrepeat ) { - // If S is undefined and N or M is undefined, set S to 2^22 or the bigger of N and M. - if ( S == -1 && ( N == -1 || M == -1 ) ) { - S = pow( 2, 22 ); - if ( S < N ) S = N; - if ( S < M ) S = M; - } + // Lets fill the input data + Kokkos::parallel_for("init",data.extent(0), KOKKOS_LAMBDA(const int i) { + for (int j=0; j 1024 ) { - M = 1024; - } - else { - M = S; - } + Kokkos::Timer timer; + for(int r = 0; r1?atoi(argv[1]):320000; + int M = argc>2?atoi(argv[2]):3; + int R = argc>3?atoi(argv[3]):10; + double scal = argc>4?atof(argv[4]):1.5; + + if(N%32) { + printf("Please choose an N dividable by 32\n"); + return 0; } + + test_scalar(N,M,R,scal); + test_simd(N,M,R,scal); + + Kokkos::finalize(); } diff --git a/Exercises/simd/Solution/simd_solution.cpp b/Exercises/simd/Solution/simd_solution.cpp index 919cc285..26da7b0f 100644 --- a/Exercises/simd/Solution/simd_solution.cpp +++ b/Exercises/simd/Solution/simd_solution.cpp @@ -1,192 +1,105 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include -#include -#include -#include - -#include -#include - -#define MemSpace Kokkos::HostSpace -#define Layout Kokkos::LayoutRight - -using ExecSpace = MemSpace::execution_space; -using range_policy = Kokkos::RangePolicy; - -void checkSizes( int &N, int &M, int &S, int &nrepeat ); - -int main( int argc, char* argv[] ) -{ - int N = -1; // number of rows 2^12 - int M = -1; // number of columns 2^10 - int S = -1; // total size 2^22 - int nrepeat = 100; // number of repeats of the test - - // Read command line arguments. - for ( int i = 0; i < argc; i++ ) { - if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { - N = pow( 2, atoi( argv[ ++i ] ) ); - printf( " User N is %d\n", N ); - } - else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { - M = pow( 2, atof( argv[ ++i ] ) ); - printf( " User M is %d\n", M ); - } - else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { - S = pow( 2, atof( argv[ ++i ] ) ); - printf( " User S is %d\n", S ); - } - else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { - nrepeat = atoi( argv[ ++i ] ); - } - else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { - printf( " y^T*A*x Options:\n" ); - printf( " -Rows (-N) : exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); - printf( " -Columns (-M) : exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); - printf( " -Size (-S) : exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); - printf( " -nrepeat : number of repetitions (default: 100)\n" ); - printf( " -help (-h): print this message\n\n" ); - exit( 1 ); - } - } +#include +#include - // Check sizes. - checkSizes( N, M, S, nrepeat ); +void test_simd(int N_in, int M, int R, double a) { - Kokkos::initialize( argc, argv ); - { - // Use SIMD with double precision. using simd_t = Kokkos::Experimental::native_simd; - // Compute the new loop limit for SIMD data type. - int M_simd = M / simd_t::size(); - // Create simd views. - Kokkos::View x( "x", M_simd);; - Kokkos::View A( "A", N, M_simd); + int N = N_in/simd_t::size(); - // Unmanaged views for initialization. - Kokkos::View x_scalar((double*)x.data(), M); - Kokkos::View A_scalar((double*)A.data(), N, M); - Kokkos::View y( "y", N); + Kokkos::View data("D",N,M); + Kokkos::View results("R",N); - // Initialize y vector. - for ( int i = 0; i < N; ++i ) { - y( i ) = 1; - } - // Initialize x vector. - for ( int i = 0; i < M; ++i ) { - x_scalar( i ) = 1; - } - // Initialize A matrix. - for ( int j = 0; j < N; ++j ) { - for ( int i = 0; i < M; ++i ) { - A_scalar( j, i ) = 1; - } - } + // For the final reduction we gonna need a scalar view of the data for now + // Relying on knowing the data layout, we will add SIMD Layouts later + // so that simple copy construction/assignment would work + Kokkos::View data_scalar((double*)data.data(),N_in,M); + Kokkos::View results_scalar((double*)results.data(),N_in); - // Timer products. - Kokkos::Timer timer; + // Lets fill the input data using scalar view + Kokkos::parallel_for("init",data_scalar.extent(0), KOKKOS_LAMBDA(const int i) { + for (int j=0; j = y^T*A*x - double result = 0; - - Kokkos::parallel_reduce( "yAx", range_policy( 0, N ), KOKKOS_LAMBDA ( int j, double &update ) { - simd_t temp2(0.0); - for ( int i = 0; i < M_simd; ++i ) { - temp2 += A( j, i ) * x( i ); - } - // Accumulate the result for each SIMD lane. - for ( std::size_t i = 0; i < simd_t::size(); ++i) { - update += y(j) * temp2[i]; + Kokkos::Timer timer; + for(int r = 0; r data("D",N,M); + Kokkos::View results("R",N); -void checkSizes( int &N, int &M, int &S, int &nrepeat ) { - // If S is undefined and N or M is undefined, set S to 2^22 or the bigger of N and M. - if ( S == -1 && ( N == -1 || M == -1 ) ) { - S = pow( 2, 22 ); - if ( S < N ) S = N; - if ( S < M ) S = M; - } + // Lets fill the input data + Kokkos::parallel_for("init",data.extent(0), KOKKOS_LAMBDA(const int i) { + for (int j=0; j 1024 ) { - M = 1024; - } - else { - M = S; - } + Kokkos::Timer timer; + for(int r = 0; r1?atoi(argv[1]):320000; + int M = argc>2?atoi(argv[2]):3; + int R = argc>3?atoi(argv[3]):10; + double scal = argc>4?atof(argv[4]):1.5; + + if(N%32) { + printf("Please choose an N dividable by 32\n"); + return 0; } + + test_scalar(N,M,R,scal); + test_simd(N,M,R,scal); + + Kokkos::finalize(); }