diff --git a/.github/workflows/self-hosted.yml b/.github/workflows/self-hosted.yml new file mode 100644 index 0000000..fdd39fe --- /dev/null +++ b/.github/workflows/self-hosted.yml @@ -0,0 +1,107 @@ +name: Self-Hosted + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + workflow_dispatch: + +env: + build_dir: "build" + +jobs: + build: + name: Build ${{ matrix.os }} GCC ${{ matrix.gcc }} CUDA ${{ matrix.cuda }} + runs-on: self-hosted + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-24.04 + cuda: "12.8" + gcc: 13 + env: + config: "Release" + + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set environment variables + run: | + echo "CUDA_PATH=/usr/local/cuda-${{ matrix.cuda }}" >> $GITHUB_ENV + echo "/usr/local/cuda-${{ matrix.cuda }}/bin" >> $GITHUB_PATH + echo "LD_LIBRARY_PATH=/usr/local/cuda-${{ matrix.cuda }}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> $GITHUB_ENV + echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV + echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV + echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> $GITHUB_ENV + + - name: Configure CMake build + run: | + cmake . 
-B ${{ env.build_dir }} -DCMAKE_BUILD_TYPE=${{ env.config }} -DCUBOOL_BUILD_TESTS=ON + + - name: Build library sources + run: | + cmake --build ${{ env.build_dir }} --target all --verbose -j `nproc` + + test: + name: Test GPU ${{ matrix.gpu }} CUDA ${{ matrix.cuda }} + needs: build + runs-on: self-hosted + strategy: + fail-fast: false + matrix: + include: + - gpu: NVIDIA-GeForce-GT-1030 + cuda: "12.9" + env: + unit-test-file: gpu_test_all.log + regression-test-file: gpu_test_regression.log + + steps: + - name: Run unit-tests + working-directory: ${{ env.build_dir }} + run: | + bash scripts/run_tests_all.sh | tee ${{ env.unit-test-file }} + + - name: Upload unit tests results + uses: actions/upload-artifact@v4 + with: + name: ${{ env.unit-test-file }} + path: ${{ env.build_dir }}/${{ env.unit-test-file }} + + - name: Check for unit tests results + working-directory: ${{ env.build_dir }} + run: | + ! grep -q "FAILED" ${{ env.unit-test-file }} + + - name: Run regression-tests + working-directory: ${{ env.build_dir }}/python + run: | + bash run_tests.sh 2>&1 | tee ${{ env.regression-test-file }} + + - name: Upload regression tests results + uses: actions/upload-artifact@v4 + with: + name: ${{ env.regression-test-file }} + path: ${{ env.build_dir }}/python/${{ env.regression-test-file }} + + - name: Check for regression tests results + working-directory: ${{ env.build_dir }}/python + run: | + ! 
grep -q "FAILED" ${{ env.regression-test-file }} + + clean: + name: Cleanup workspace + needs: test + if: always() + runs-on: self-hosted + + steps: + - name: Cleanup workspace + run: | + rm -rf ${{ github.workspace }}/* + rm -rf ${{ github.workspace }}/.* diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index cd33dd3..bf22ee3 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -1,5 +1,3 @@ -# Original script from https://github.com/ptheywood/cuda-cmake-github-actions - name: Ubuntu on: @@ -9,74 +7,97 @@ on: branches: [ master ] workflow_dispatch: +env: + build_dir: "build" + artifact: "cubool-ubuntu-build.tar.xz" + jobs: build: - name: Build ${{ matrix.os }} GCC ${{ matrix.gcc }} CUDA ${{ matrix.cuda }} + name: Build ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: - - os: ubuntu-18.04 - cuda: "10.1" - gcc: 8 + - os: ubuntu-22.04 env: - build_dir: "build" config: "Release" - artifact: "cubool-ubuntu-build.tar.xz" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: true - - uses: actions/setup-python@v2 - with: - python-version: '3.7' - - name: Install CUDA - env: - cuda: ${{ matrix.cuda }} + - name: Configure CMake build run: | - source ./scripts/install_cuda_ubuntu.sh - if [[ $? -eq 0 ]]; then - # Set paths for subsequent steps, using ${CUDA_PATH} - echo "Adding CUDA to CUDA_PATH, PATH and LD_LIBRARY_PATH" - echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV - echo "${CUDA_PATH}/bin" >> $GITHUB_PATH - echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV - fi - shell: bash - - - name: Install and configure GCC and GXX + cmake . 
-B ${{ env.build_dir }} -DCMAKE_BUILD_TYPE=${{ env.config }} -DCUBOOL_BUILD_TESTS=ON -DCUBOOL_WITH_CUDA=OFF + + - name: Build library sources run: | - sudo apt-get install -y gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} - echo "СС=/usr/bin/gcc-${{ matrix.gcc }}" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV - echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV + cmake --build ${{ env.build_dir }} --target all --verbose -j `nproc` - - name: Configure CMake build - run: cmake . -B ${{ env.build_dir }} -DCMAKE_BUILD_TYPE=${{ env.config }} -DCUBOOL_BUILD_TESTS=YES + - name: Prepare upload binary + run: | + tar cfz ${{ env.artifact }} ${{ env.build_dir }} - - name: Build library sources + - name: Upload binary + uses: actions/upload-artifact@v4 + with: + name: ${{ env.artifact }} + path: ${{ env.artifact }} + + test: + name: Test CPU ${{ matrix.cpu }} + needs: build + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-22.04 + cpu: AMD-EPYC-7763 + env: + unit-test-file: cpu_test_all.log + regression-test-file: cpu_test_regression.log + + steps: + - uses: actions/download-artifact@v4 + with: + name: ${{ env.artifact }} + + - name: Unarchive artifact + run: | + tar xzf ${{ env.artifact }} + rm ${{ env.artifact }} + + - name: Run unit-tests working-directory: ${{ env.build_dir }} - run: cmake --build . --target all --verbose -j `nproc` + run: | + bash scripts/run_tests_all.sh | tee ${{ env.unit-test-file }} + + - name: Upload unit tests resutls + uses: actions/upload-artifact@v4 + with: + name: ${{ env.unit-test-file }} + path: ${{ env.build_dir }}/${{ env.unit-test-file }} - - name: Run unit-tests (sequential backend) + - name: Check for unit tests results working-directory: ${{ env.build_dir }} - run: bash scripts/run_tests_fallback.sh - shell: bash + run: | + ! 
grep -q "FAILED" ${{ env.unit-test-file }} - - name: Run regression-tests (sequential backend) + - name: Run regression-tests working-directory: ${{ env.build_dir }}/python - run: bash run_tests.sh - shell: bash - - - name: Prepare upload binary - shell: bash - run: tar cfz ${{ env.artifact }} ${{ env.build_dir }} + run: | + bash run_tests.sh 2>&1 | tee ${{ env.regression-test-file }} - - name: Upload binary - uses: actions/upload-artifact@v2 + - name: Upload regression tests resutls + uses: actions/upload-artifact@v4 with: - name: ${{ env.artifact }} - path: ${{ env.artifact }} \ No newline at end of file + name: ${{ env.regression-test-file }} + path: ${{ env.build_dir }}/python/${{ env.regression-test-file }} + + - name: Check for regression tests results + working-directory: ${{ env.build_dir }}/python + run: | + ! grep -q "FAILED" ${{ env.regression-test-file }} diff --git a/.gitmodules b/.gitmodules index 3adb3ec..f98d82d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "deps/gtest"] path = deps/gtest url = https://github.com/google/googletest.git -[submodule "deps/cub"] - path = deps/cub - url = https://github.com/NVIDIA/cub.git diff --git a/CMakeLists.txt b/CMakeLists.txt index fb9212a..a6404f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ option(CUBOOL_WITH_NAIVE "Build library with naive and naive-shared dens option(CUBOOL_BUILD_TESTS "Build project unit-tests with gtest" ON) option(CUBOOL_COPY_TO_PY_PACKAGE "Copy compiled shared library into python package folder (for package use purposes)" ON) +option(CUBOOL_USE_NSPARSE_MERGE_FUNCTOR "Use nsparse optimiztion for matrix addition function" ON) + set(CUBOOL_VERSION_MAJOR 1) set(CUBOOL_VERSION_MINOR 0) set(CUBOOL_VERSION_SUB 0) @@ -32,14 +34,6 @@ endif() # Configure cuda dependencies if (CUBOOL_WITH_CUDA) - message(STATUS "Add cub as cuda utility") - set(CUB_ENABLE_HEADER_TESTING OFF CACHE BOOL "" FORCE) - set(CUB_ENABLE_TESTING OFF CACHE BOOL "" FORCE) - 
set(CUB_ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) - add_subdirectory(deps/cub) - add_library(cub INTERFACE IMPORTED) - target_link_libraries(cub INTERFACE CUB::CUB) - message(STATUS "Add nsparse library as crs matrix multiplication backend") add_subdirectory(deps/nsparse-um) endif() @@ -61,4 +55,4 @@ add_subdirectory(cubool) file(COPY scripts DESTINATION ${CMAKE_BINARY_DIR}/) # Copy python related stuff -file(COPY python DESTINATION ${CMAKE_BINARY_DIR}/) \ No newline at end of file +file(COPY python DESTINATION ${CMAKE_BINARY_DIR}/) diff --git a/README.md b/README.md index 160a767..e0bcb23 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,7 @@ $ bash ./scripts/run_tests_all.sh By default, the following cmake options will be automatically enabled: - `CUBOOL_WITH_CUDA` - build library with actual cuda backend -- `CUBOOL_WITH_SEQUENTIAL` - build library witt cpu based backend +- `CUBOOL_WITH_SEQUENTIAL` - build library with cpu based backend - `CUBOOL_WITH_TESTS` - build library unit-tests collection > Note: in order to provide correct GCC version for CUDA sources compiling, diff --git a/cubool/CMakeLists.txt b/cubool/CMakeLists.txt index 092504b..ed3cb41 100644 --- a/cubool/CMakeLists.txt +++ b/cubool/CMakeLists.txt @@ -87,6 +87,7 @@ set(CUBOOL_C_API_SOURCES sources/cuBool_Matrix_Reduce2.cpp sources/cuBool_Matrix_EWiseAdd.cpp sources/cuBool_Matrix_EWiseMult.cpp + sources/cuBool_Matrix_EWiseMultInverted.cpp sources/cuBool_Vector_New.cpp sources/cuBool_Vector_Build.cpp sources/cuBool_Vector_SetElement.cpp @@ -125,6 +126,7 @@ if (CUBOOL_WITH_CUDA) sources/cuda/cuda_matrix.cu sources/cuda/cuda_matrix_ewiseadd.cu sources/cuda/cuda_matrix_ewisemult.cu + sources/cuda/cuda_matrix_ewisemult_inverted.cu sources/cuda/cuda_matrix_kronecker.cu sources/cuda/cuda_matrix_multiply.cu sources/cuda/cuda_matrix_transpose.cu @@ -147,6 +149,7 @@ if (CUBOOL_WITH_CUDA) sources/cuda/kernels/spgemv_t.cuh sources/cuda/kernels/spewiseadd.cuh sources/cuda/kernels/spewisemult.cuh + 
sources/cuda/kernels/spewisemultinverted.cuh sources/cuda/kernels/sptranspose.cuh sources/cuda/kernels/sptranspose2.cuh sources/cuda/kernels/spkron.cuh @@ -173,6 +176,8 @@ if (CUBOOL_WITH_SEQUENTIAL) sources/sequential/sq_ewiseadd.hpp sources/sequential/sq_ewisemult.cpp sources/sequential/sq_ewisemult.hpp + sources/sequential/sq_ewisemultinverted.cpp + sources/sequential/sq_ewisemultinverted.hpp sources/sequential/sq_spgemm.cpp sources/sequential/sq_spgemm.hpp sources/sequential/sq_spgemv.cpp @@ -201,11 +206,9 @@ target_compile_definitions(cubool PRIVATE CUBOOL_VERSION_MAJOR=${CUBOOL_VERSION_ target_compile_definitions(cubool PRIVATE CUBOOL_VERSION_MINOR=${CUBOOL_VERSION_MINOR}) target_compile_definitions(cubool PRIVATE CUBOOL_VERSION_SUB=${CUBOOL_VERSION_SUB}) -target_compile_features(cubool PUBLIC cxx_std_14) +target_compile_definitions(cubool PRIVATE CUBOOL_USE_NSPARSE_MERGE_FUNCTOR=$) -target_compile_options(cubool PRIVATE $<$: -Wall>) -target_compile_options(cubool PRIVATE $<$,$>: -O2>) -target_compile_options(cubool PRIVATE $<$,$>: -O0>) +target_compile_features(cubool PUBLIC cxx_std_17) set_target_properties(cubool PROPERTIES CXX_STANDARD 17) set_target_properties(cubool PROPERTIES CXX_STANDARD_REQUIRED ON) @@ -219,7 +222,6 @@ endforeach() if (CUBOOL_WITH_CUDA) set_target_properties(cubool PROPERTIES CUDA_STANDARD 14) set_target_properties(cubool PROPERTIES CUDA_STANDARD_REQUIRED ON) - set_target_properties(cubool PROPERTIES CUDA_SEPARABLE_COMPILATION ON) # Settings: https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -256,4 +258,4 @@ if (CUBOOL_COPY_TO_PY_PACKAGE) "${CMAKE_BINARY_DIR}/cubool/${LIBRARY_FILE_NAME}" "${CMAKE_BINARY_DIR}/python/pycubool" COMMENT "Copy ${LIBRARY_FILE_NAME} compiled lib into python folder") -endif() \ No newline at end of file +endif() diff --git a/cubool/include/cubool/cubool.h b/cubool/include/cubool/cubool.h index cd16bb3..b5f7885 100644 --- a/cubool/include/cubool/cubool.h +++ 
b/cubool/include/cubool/cubool.h @@ -925,4 +925,30 @@ CUBOOL_EXPORT CUBOOL_API cuBool_Status cuBool_Kronecker( cuBool_Hints hints ); +/** + * Performs result = matrix * ~mask, where + * '*' is element-wise boolean 'and' operation + * '~' is operation for invert matrix (0 swaps to 1 and 1 to 0) + * + * @note To perform this operation matrices must have the same dimensions + * dim(matrix) = M x N + * dim(mask) = M x N + * dim(result) = M x N + * + * @note Pass `CUBOOL_HINT_TIME_CHECK` hint to measure operation time + * + * @param result[out] Destination matrix to store result + * @param matrix Source matrix to be multiplied + * @param mask Source matrix to be inverted and multiplied + * @param hints Hints for the operation + * + * @return Error code on this operation + */ +CUBOOL_EXPORT CUBOOL_API cuBool_Status cuBool_Matrix_EWiseMulInverted( + cuBool_Matrix result, + cuBool_Matrix matrix, + cuBool_Matrix mask, + cuBool_Hints hints +); + #endif //CUBOOL_CUBOOL_H diff --git a/cubool/sources/backend/matrix_base.hpp index 5332031..8842eb9 100644 --- a/cubool/sources/backend/matrix_base.hpp +++ b/cubool/sources/backend/matrix_base.hpp @@ -49,6 +49,7 @@ namespace cubool { virtual void kronecker(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) = 0; virtual void eWiseAdd(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) = 0; virtual void eWiseMult(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) = 0; + virtual void eWiseMultInverted(const MatrixBase &matrix, const MatrixBase &mask, bool checkTime) = 0; virtual index getNrows() const = 0; virtual index getNcols() const = 0; @@ -59,4 +60,4 @@ } -#endif //CUBOOL_MATRIX_BASE_HPP \ No newline at end of file +#endif //CUBOOL_MATRIX_BASE_HPP diff --git a/cubool/sources/core/matrix.cpp index 571b010..49e35c3 100644 --- a/cubool/sources/core/matrix.cpp +++ b/cubool/sources/core/matrix.cpp @@ -349,6 +349,46 
@@ namespace cubool { mHnd->eWiseMult(*a->mHnd, *b->mHnd, false); } + void Matrix::eWiseMultInverted(const MatrixBase &matrix, const MatrixBase &mask, bool checkTime) { + + const auto* mat = dynamic_cast<const Matrix*>(&matrix); + const auto* msk = dynamic_cast<const Matrix*>(&mask); + + CHECK_RAISE_ERROR(mat != nullptr, InvalidArgument, "Passed matrix does not belong to core matrix class"); + CHECK_RAISE_ERROR(msk != nullptr, InvalidArgument, "Passed matrix does not belong to core matrix class"); + + index M = mat->getNrows(); + index N = msk->getNcols(); + + CHECK_RAISE_ERROR(M == msk->getNrows(), InvalidArgument, "Passed matrices have incompatible size"); + CHECK_RAISE_ERROR(N == mat->getNcols(), InvalidArgument, "Passed matrices have incompatible size"); + + CHECK_RAISE_ERROR(M == this->getNrows(), InvalidArgument, "Matrix has incompatible size for operation result"); + CHECK_RAISE_ERROR(N == this->getNcols(), InvalidArgument, "Matrix has incompatible size for operation result"); + + mat->commitCache(); + msk->commitCache(); + this->releaseCache(); + + if (checkTime) { + TIMER_ACTION(timer, mHnd->eWiseMultInverted(*mat->mHnd, *msk->mHnd, false)); + + LogStream stream(*Library::getLogger()); + stream << Logger::Level::Info + << "Time: " << timer.getElapsedTimeMs() << " ms " + << "Matrix::eWiseMultInverted: " + << this->getDebugMarker() << " = " + << mat->getDebugMarker() << " * ~" + << msk->getDebugMarker() << LogStream::cmt; + + return; + } + + mHnd->eWiseMultInverted(*mat->mHnd, *msk->mHnd, false); + } + + + index Matrix::getNrows() const { return mHnd->getNrows(); } @@ -396,4 +436,4 @@ namespace cubool { // Clear arrays releaseCache(); } -} \ No newline at end of file +} diff --git a/cubool/sources/core/matrix.hpp index 20cb890..6c4d38b 100644 --- a/cubool/sources/core/matrix.hpp +++ b/cubool/sources/core/matrix.hpp @@ -56,6 +56,7 @@ namespace cubool { void kronecker(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) override; void 
eWiseAdd(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) override; void eWiseMult(const MatrixBase &a, const MatrixBase &b, bool checkTime) override; + void eWiseMultInverted(const MatrixBase &matrix, const MatrixBase &mask, bool checkTime) override; index getNrows() const override; index getNcols() const override; @@ -77,4 +78,4 @@ namespace cubool { } -#endif //CUBOOL_MATRIX_HPP \ No newline at end of file +#endif //CUBOOL_MATRIX_HPP diff --git a/cubool/sources/cuBool_Matrix_EWiseMultInverted.cpp b/cubool/sources/cuBool_Matrix_EWiseMultInverted.cpp new file mode 100644 index 0000000..37fe607 --- /dev/null +++ b/cubool/sources/cuBool_Matrix_EWiseMultInverted.cpp @@ -0,0 +1,44 @@ +/**********************************************************************************/ +/* MIT License */ +/* */ +/* Copyright (c) 2020, 2021 JetBrains-Research */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining a copy */ +/* of this software and associated documentation files (the "Software"), to deal */ +/* in the Software without restriction, including without limitation the rights */ +/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ +/* copies of the Software, and to permit persons to whom the Software is */ +/* furnished to do so, subject to the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be included in all */ +/* copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ +/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ +/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE */ +/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ +/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ +/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ +/* SOFTWARE. */ +/**********************************************************************************/ + +#include + +CUBOOL_EXPORT CUBOOL_API cuBool_Status cuBool_Matrix_EWiseMulInverted( + cuBool_Matrix result, + cuBool_Matrix matrix, + cuBool_Matrix mask, + cuBool_Hints hints +) { + CUBOOL_BEGIN_BODY + CUBOOL_VALIDATE_LIBRARY + CUBOOL_ARG_NOT_NULL(result) + CUBOOL_ARG_NOT_NULL(matrix) + CUBOOL_ARG_NOT_NULL(mask) + auto resultM = (cubool::Matrix *) result; + auto matrixM = (cubool::Matrix *) matrix; + auto maskM = (cubool::Matrix *) mask; + + resultM->eWiseMultInverted(*matrixM, *maskM, hints & CUBOOL_HINT_TIME_CHECK); + CUBOOL_END_BODY +} diff --git a/cubool/sources/cuda/cuda_matrix.hpp b/cubool/sources/cuda/cuda_matrix.hpp index e19fec1..9f1b7a2 100644 --- a/cubool/sources/cuda/cuda_matrix.hpp +++ b/cubool/sources/cuda/cuda_matrix.hpp @@ -56,6 +56,7 @@ namespace cubool { void kronecker(const MatrixBase &a, const MatrixBase &b, bool checkTime) override; void eWiseAdd(const MatrixBase &a, const MatrixBase &b, bool checkTime) override; void eWiseMult(const MatrixBase &a, const MatrixBase &b, bool checkTime) override; + void eWiseMultInverted(const MatrixBase &a, const MatrixBase &b, bool checkTime) override; index getNrows() const override; index getNcols() const override; @@ -79,4 +80,4 @@ namespace cubool { }; }; -#endif //CUBOOL_CUDA_MATRIX_HPP \ No newline at end of file +#endif //CUBOOL_CUDA_MATRIX_HPP diff --git a/cubool/sources/cuda/cuda_matrix_ewisemult_inverted.cu b/cubool/sources/cuda/cuda_matrix_ewisemult_inverted.cu new file mode 100644 index 0000000..2e0fa87 --- /dev/null +++ b/cubool/sources/cuda/cuda_matrix_ewisemult_inverted.cu @@ -0,0 +1,63 @@ 
+/**********************************************************************************/ +/* MIT License */ +/* */ +/* Copyright (c) 2020, 2021 JetBrains-Research */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining a copy */ +/* of this software and associated documentation files (the "Software"), to deal */ +/* in the Software without restriction, including without limitation the rights */ +/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ +/* copies of the Software, and to permit persons to whom the Software is */ +/* furnished to do so, subject to the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be included in all */ +/* copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ +/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ +/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ +/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ +/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ +/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ +/* SOFTWARE. 
*/ +/**********************************************************************************/ + +#include +#include + +namespace cubool { + + void CudaMatrix::eWiseMultInverted(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) { + auto a = dynamic_cast(&aBase); + auto b = dynamic_cast(&bBase); + + CHECK_RAISE_ERROR(a != nullptr, InvalidArgument, "Passed matrix does not belong to csr matrix class"); + CHECK_RAISE_ERROR(b != nullptr, InvalidArgument, "Passed matrix does not belong to csr matrix class"); + + index M = this->getNrows(); + index N = this->getNcols(); + + + assert(a->getNrows() == M); + assert(a->getNcols() == N); + + assert(b->getNrows() == M); + assert(b->getNcols() == N); + + if (a->isMatrixEmpty() || b->isMatrixEmpty()) { + this->clearAndResizeStorageToDim(); + return; + } + + // Ensure csr proper csr format even if empty + a->resizeStorageToDim(); + b->resizeStorageToDim(); + + kernels::SpVectorEWiseMultInverted> spFunctor; + auto result = spFunctor(a->mMatrixImpl, b->mMatrixImpl); + + // Assign the actual impl result to this storage + this->mMatrixImpl = std::move(result); + } + +} diff --git a/cubool/sources/cuda/cuda_vector.cu b/cubool/sources/cuda/cuda_vector.cu index ced56a0..f2fa9b0 100644 --- a/cubool/sources/cuda/cuda_vector.cu +++ b/cubool/sources/cuda/cuda_vector.cu @@ -29,6 +29,8 @@ #include #include +#include + namespace cubool { CudaVector::CudaVector(size_t nrows, CudaInstance &instance) diff --git a/cubool/sources/cuda/kernels/spewiseadd.cuh b/cubool/sources/cuda/kernels/spewiseadd.cuh index f6ad892..d1e1e87 100644 --- a/cubool/sources/cuda/kernels/spewiseadd.cuh +++ b/cubool/sources/cuda/kernels/spewiseadd.cuh @@ -27,6 +27,8 @@ #include +#include + namespace cubool { namespace kernels { diff --git a/cubool/sources/cuda/kernels/spewisemult.cuh b/cubool/sources/cuda/kernels/spewisemult.cuh index a4705de..6d17a46 100644 --- a/cubool/sources/cuda/kernels/spewisemult.cuh +++ b/cubool/sources/cuda/kernels/spewisemult.cuh @@ 
-30,6 +30,8 @@ #include #include +#include + namespace cubool { namespace kernels { diff --git a/cubool/sources/cuda/kernels/spewisemultinverted.cuh b/cubool/sources/cuda/kernels/spewisemultinverted.cuh new file mode 100644 index 0000000..353bc97 --- /dev/null +++ b/cubool/sources/cuda/kernels/spewisemultinverted.cuh @@ -0,0 +1,106 @@ +/**********************************************************************************/ +/* MIT License */ +/* */ +/* Copyright (c) 2020, 2021 JetBrains-Research */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining a copy */ +/* of this software and associated documentation files (the "Software"), to deal */ +/* in the Software without restriction, including without limitation the rights */ +/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ +/* copies of the Software, and to permit persons to whom the Software is */ +/* furnished to do so, subject to the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be included in all */ +/* copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ +/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ +/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ +/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ +/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ +/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ +/* SOFTWARE. 
*/ +/**********************************************************************************/ + +#ifndef CUBOOL_SPEWISEMULTINVERTED_CUH +#define CUBOOL_SPEWISEMULTINVERTED_CUH + +#include +#include +#include +#include + +#include + +namespace cubool { + namespace kernels { + + template + struct SpVectorEWiseMultInverted { + template + using ContainerType = thrust::device_vector::other>; + using MatrixType = nsparse::matrix; + using LargeIndexType = unsigned long; + + static_assert(sizeof(LargeIndexType) > sizeof(IndexType), "Values intersection index must be larger"); + + static void fillIndices(const MatrixType& m, ContainerType& out) { + thrust::for_each(thrust::counting_iterator(0), thrust::counting_iterator(m.m_vals), + [rowOffset=m.m_row_index.data(), colIndex=m.m_col_index.data(), + outIndices=out.data(), nrows=m.m_rows, ncols=m.m_cols] __device__ (IndexType valueId) { + LargeIndexType row = findNearestRowIdx(valueId, nrows, rowOffset); + LargeIndexType col = colIndex[valueId]; + LargeIndexType index = row * ncols + col; + outIndices[valueId] = index; + }); + } + + MatrixType operator()(const MatrixType& a, const MatrixType& b) { + auto aNvals = a.m_vals; + auto bNvals = b.m_vals; + auto worst = aNvals; + + // Allocate memory for the worst case scenario + ContainerType inputA(aNvals); + ContainerType inputB(bNvals); + + fillIndices(a, inputA); + fillIndices(b, inputB); + + ContainerType intersected(worst); + + auto out = thrust::set_difference(inputA.begin(), inputA.end(), + inputB.begin(), inputB.end(), + intersected.begin()); + + // Count result nvals count + auto nvals = thrust::distance(intersected.begin(), out); + + ContainerType rowOffsetTmp(a.m_rows + 1); + ContainerType colIndex(nvals); + + thrust::fill(rowOffsetTmp.begin(), rowOffsetTmp.end(), 0); + + thrust::for_each(thrust::counting_iterator(0), thrust::counting_iterator(nvals), + [rowOffset=rowOffsetTmp.data(), colIndex=colIndex.data(), intersected=intersected.data(), + nrows=a.m_rows, ncols=a.m_cols] __device__ 
(IndexType valueId) { + LargeIndexType i = intersected[valueId]; + LargeIndexType row = i / ncols; + LargeIndexType col = i % ncols; + atomicAdd((rowOffset + row).get(), 1); + colIndex[valueId] = (IndexType) col; + }); + + ContainerType rowOffset(a.m_rows + 1); + thrust::exclusive_scan(rowOffsetTmp.begin(), rowOffsetTmp.end(), rowOffset.begin(), 0, thrust::plus()); + + assert(nvals == rowOffset.back()); + + return MatrixType(std::move(colIndex), std::move(rowOffset), a.m_rows, a.m_cols, nvals); + } + }; + + } +} + +#endif //CUBOOL_SPEWISEMULT_CUH diff --git a/cubool/sources/cuda/kernels/spgemv.cuh b/cubool/sources/cuda/kernels/spgemv.cuh index 7340174..818ff62 100644 --- a/cubool/sources/cuda/kernels/spgemv.cuh +++ b/cubool/sources/cuda/kernels/spgemv.cuh @@ -32,6 +32,8 @@ #include #include +#include + namespace cubool { namespace kernels { diff --git a/cubool/sources/cuda/kernels/spgemv_t.cuh b/cubool/sources/cuda/kernels/spgemv_t.cuh index 6225d71..2a5db83 100644 --- a/cubool/sources/cuda/kernels/spgemv_t.cuh +++ b/cubool/sources/cuda/kernels/spgemv_t.cuh @@ -32,6 +32,8 @@ #include #include +#include + namespace cubool { namespace kernels { diff --git a/cubool/sources/cuda/kernels/spmerge.cuh b/cubool/sources/cuda/kernels/spmerge.cuh index 8a14f8d..4734e35 100644 --- a/cubool/sources/cuda/kernels/spmerge.cuh +++ b/cubool/sources/cuda/kernels/spmerge.cuh @@ -29,9 +29,13 @@ #include #include +#include +#include + namespace cubool { namespace kernels { +#if CUBOOL_USE_NSPARSE_MERGE_FUNCTOR template class SpMergeFunctor { public: @@ -53,7 +57,7 @@ namespace cubool { assert(a.m_cols == b.m_cols); IndexType rows = a.m_rows; - IndexType cols = a.m_rows; + IndexType cols = a.m_cols; constexpr auto config_merge = make_bin_seq< @@ -77,6 +81,72 @@ namespace cubool { private: nsparse::unique_merge_functor_t uniqueMergeFunctor; }; +#else + template + class SpMergeFunctor { + public: + template + using ContainerType = thrust::device_vector::other>; + using MatrixType = 
nsparse::matrix; + using LargeIndexType = unsigned long; + + static void fillIndices(const MatrixType& m, ContainerType& out) { + thrust::for_each(thrust::counting_iterator(0), thrust::counting_iterator(m.m_vals), + [rowOffset=m.m_row_index.data(), colIndex=m.m_col_index.data(), + outIndices=out.data(), nrows=m.m_rows, ncols=m.m_cols] __device__ (IndexType valueId) { + LargeIndexType row = findNearestRowIdx(valueId, nrows, rowOffset); + LargeIndexType col = colIndex[valueId]; + LargeIndexType index = row * ncols + col; + outIndices[valueId] = index; + }); + } + + MatrixType operator()(const MatrixType& a, const MatrixType& b) { + auto aNvals = a.m_vals; + auto bNvals = b.m_vals; + auto worst = aNvals + bNvals; + + // Allocate memory for the worst case scenario + ContainerType inputA(aNvals); + ContainerType inputB(bNvals); + + fillIndices(a, inputA); + fillIndices(b, inputB); + + ContainerType intersected(worst); + + auto out = thrust::set_union(inputA.begin(), inputA.end(), + inputB.begin(), inputB.end(), + intersected.begin()); + + // Count result nvals count + auto nvals = thrust::distance(intersected.begin(), out); + + ContainerType rowOffsetTmp(a.m_rows + 1); + ContainerType colIndex(nvals); + + thrust::fill(rowOffsetTmp.begin(), rowOffsetTmp.end(), 0); + + thrust::for_each(thrust::counting_iterator(0), thrust::counting_iterator(nvals), + [rowOffset=rowOffsetTmp.data(), colIndex=colIndex.data(), intersected=intersected.data(), + nrows=a.m_rows, ncols=a.m_cols] __device__ (IndexType valueId) { + LargeIndexType i = intersected[valueId]; + LargeIndexType row = i / ncols; + LargeIndexType col = i % ncols; + atomicAdd((rowOffset + row).get(), 1); + colIndex[valueId] = (IndexType) col; + }); + + ContainerType rowOffset(a.m_rows + 1); + thrust::exclusive_scan(rowOffsetTmp.begin(), rowOffsetTmp.end(), rowOffset.begin(), 0, thrust::plus()); + + assert(nvals == rowOffset.back()); + + return MatrixType(std::move(colIndex), std::move(rowOffset), a.m_rows, a.m_cols, 
nvals); + } + }; + +#endif } } diff --git a/cubool/sources/cuda/kernels/spreduce.cuh b/cubool/sources/cuda/kernels/spreduce.cuh index 3a15bb1..cfcf398 100644 --- a/cubool/sources/cuda/kernels/spreduce.cuh +++ b/cubool/sources/cuda/kernels/spreduce.cuh @@ -30,6 +30,8 @@ #include #include +#include + namespace cubool { namespace kernels { diff --git a/cubool/sources/cuda/kernels/sptranspose2.cuh b/cubool/sources/cuda/kernels/sptranspose2.cuh index 8c50e45..c686818 100644 --- a/cubool/sources/cuda/kernels/sptranspose2.cuh +++ b/cubool/sources/cuda/kernels/sptranspose2.cuh @@ -29,6 +29,8 @@ #include #include +#include + namespace cubool { namespace kernels { diff --git a/cubool/sources/sequential/sq_ewisemultinverted.cpp b/cubool/sources/sequential/sq_ewisemultinverted.cpp new file mode 100644 index 0000000..ee810c9 --- /dev/null +++ b/cubool/sources/sequential/sq_ewisemultinverted.cpp @@ -0,0 +1,105 @@ +/**********************************************************************************/ +/* MIT License */ +/* */ +/* Copyright (c) 2020, 2021 JetBrains-Research */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining a copy */ +/* of this software and associated documentation files (the "Software"), to deal */ +/* in the Software without restriction, including without limitation the rights */ +/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ +/* copies of the Software, and to permit persons to whom the Software is */ +/* furnished to do so, subject to the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be included in all */ +/* copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ +/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ +/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE */ +/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ +/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ +/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ +/* SOFTWARE. */ +/**********************************************************************************/ + +#include +#include + +namespace cubool { + + void sq_ewisemultinverted(const CsrData& a, const CsrData& b, CsrData& out) { + out.rowOffsets.resize(a.nrows + 1, 0); + + size_t nvals = 0; + + // Count nnz of the result matrix to allocate memory + for (index i = 0; i < a.nrows; i++) { + index ak = a.rowOffsets[i]; + index bk = b.rowOffsets[i]; + index asize = a.rowOffsets[i + 1] - ak; + index bsize = b.rowOffsets[i + 1] - bk; + + const index* ar = &a.colIndices[ak]; + const index* br = &b.colIndices[bk]; + const index* arend = ar + asize; + const index* brend = br + bsize; + + index nvalsInRow = 0; + + while (ar != arend && br != brend) { + if (*ar == *br) { + ar++; + br++; + } + else if (*ar < *br) { + nvalsInRow++; + ar++; + } + else { + br++; + } + } + nvalsInRow += arend - ar; + + nvals += nvalsInRow; + out.rowOffsets[i] = nvalsInRow; + } + + // Eval row offsets + exclusive_scan(out.rowOffsets.begin(), out.rowOffsets.end(), 0); + + // Allocate memory for values + out.nvals = nvals; + out.colIndices.resize(nvals); + + // Fill sorted column indices + size_t k = 0; + for (index i = 0; i < a.nrows; i++) { + const index* ar = &a.colIndices[a.rowOffsets[i]]; + const index* br = &b.colIndices[b.rowOffsets[i]]; + const index* arend = &a.colIndices[a.rowOffsets[i + 1]]; + const index* brend = &b.colIndices[b.rowOffsets[i + 1]]; + + while (ar != arend && br != brend) { + if (*ar == *br) { + ar++; + br++; + } + else if (*ar < *br) { + out.colIndices[k] = *ar; + k++; + ar++; + } + else { + br++; + } + } + + while (ar != arend) { + out.colIndices[k] = *ar; + k++; + ar++; + } + } + } +} diff --git 
a/cubool/sources/sequential/sq_ewisemultinverted.hpp b/cubool/sources/sequential/sq_ewisemultinverted.hpp new file mode 100644 index 0000000..99b2615 --- /dev/null +++ b/cubool/sources/sequential/sq_ewisemultinverted.hpp @@ -0,0 +1,42 @@ +/**********************************************************************************/ +/* MIT License */ +/* */ +/* Copyright (c) 2020, 2021 JetBrains-Research */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining a copy */ +/* of this software and associated documentation files (the "Software"), to deal */ +/* in the Software without restriction, including without limitation the rights */ +/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ +/* copies of the Software, and to permit persons to whom the Software is */ +/* furnished to do so, subject to the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be included in all */ +/* copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ +/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ +/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ +/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ +/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ +/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ +/* SOFTWARE. */ +/**********************************************************************************/ + +#ifndef CUBOOL_SQ_EWISEMULTINVERTED_HPP +#define CUBOOL_SQ_EWISEMULTINVERTED_HPP + +#include + +namespace cubool { + + /** + * Element-wise multiplication of the matrices `a` and inverted `b`. 
+ * + * @param a Input matrix + * @param b Input matrix, which will be inverted before multiplication + * @param[out] out Where to store the result + */ + void sq_ewisemultinverted(const CsrData& a, const CsrData& b, CsrData& out); +} + +#endif //CUBOOL_SQ_EWISEMULTINVERTED_HPP diff --git a/cubool/sources/sequential/sq_matrix.cpp b/cubool/sources/sequential/sq_matrix.cpp index df96e4e..00473f1 100644 --- a/cubool/sources/sequential/sq_matrix.cpp +++ b/cubool/sources/sequential/sq_matrix.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -235,6 +236,31 @@ namespace cubool { this->mData = std::move(out); } + void SqMatrix::eWiseMultInverted(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) { + auto a = dynamic_cast(&aBase); + auto b = dynamic_cast(&bBase); + + CHECK_RAISE_ERROR(a != nullptr, InvalidArgument, "Provided matrix does not belongs to sequential matrix class"); + CHECK_RAISE_ERROR(b != nullptr, InvalidArgument, "Provided matrix does not belongs to sequential matrix class"); + + assert(a->getNrows() == this->getNrows()); + assert(a->getNcols() == this->getNcols()); + assert(a->getNrows() == b->getNrows()); + assert(a->getNcols() == b->getNcols()); + + CsrData out; + out.nrows = this->getNrows(); + out.ncols = this->getNcols(); + + a->allocateStorage(); + b->allocateStorage(); + sq_ewisemultinverted(a->mData, b->mData, out); + + this->mData = std::move(out); + } + + + index SqMatrix::getNrows() const { return mData.nrows; } diff --git a/cubool/sources/sequential/sq_matrix.hpp b/cubool/sources/sequential/sq_matrix.hpp index 1a3f539..d348232 100644 --- a/cubool/sources/sequential/sq_matrix.hpp +++ b/cubool/sources/sequential/sq_matrix.hpp @@ -51,6 +51,7 @@ namespace cubool { void kronecker(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) override; void eWiseAdd(const MatrixBase &aBase, const MatrixBase &bBase, bool checkTime) override; void eWiseMult(const MatrixBase &a, const MatrixBase &b, 
bool checkTime) override; + void eWiseMultInverted(const MatrixBase &a, const MatrixBase &b, bool checkTime) override; index getNrows() const override; index getNcols() const override; diff --git a/cubool/sources/utils/data_utils.hpp b/cubool/sources/utils/data_utils.hpp index 2356382..b7863b3 100644 --- a/cubool/sources/utils/data_utils.hpp +++ b/cubool/sources/utils/data_utils.hpp @@ -27,6 +27,7 @@ #include #include +#include namespace cubool { diff --git a/cubool/tests/CMakeLists.txt b/cubool/tests/CMakeLists.txt index 0e0f620..84ddc96 100644 --- a/cubool/tests/CMakeLists.txt +++ b/cubool/tests/CMakeLists.txt @@ -34,6 +34,9 @@ target_link_libraries(test_matrix_ewiseadd PUBLIC testing) add_executable(test_matrix_ewisemult test_matrix_ewisemult.cpp) target_link_libraries(test_matrix_ewisemult PUBLIC testing) +add_executable(test_matrix_ewisemult_inverted test_matrix_ewisemult_inverted.cpp) +target_link_libraries(test_matrix_ewisemult_inverted PUBLIC testing) + add_executable(test_vector_misc test_vector_misc.cpp) target_link_libraries(test_vector_misc PUBLIC testing) diff --git a/cubool/tests/test_matrix_ewisemult_inverted.cpp b/cubool/tests/test_matrix_ewisemult_inverted.cpp new file mode 100644 index 0000000..250ba55 --- /dev/null +++ b/cubool/tests/test_matrix_ewisemult_inverted.cpp @@ -0,0 +1,109 @@ +#include "cubool/cubool.h" +#include +#include + +using DataMatrix = std::vector>; + +void testApplyNotMask(const DataMatrix &matrix_data, const DataMatrix &mask_data) { + cuBool_Index nrows, ncols; + nrows = matrix_data.size(); + ncols = matrix_data[0].size(); + testing::Matrix test_matrix = testing::Matrix::generatet(nrows, ncols, + [&matrix_data](cuBool_Index i, cuBool_Index j) { return matrix_data[i][j]; }); + nrows = mask_data.size(); + ncols = mask_data[0].size(); + testing::Matrix test_mask = testing::Matrix::generatet(nrows, ncols, + [&mask_data](cuBool_Index i, cuBool_Index j) { return mask_data[i][j]; }); + + cuBool_Matrix matrix, mask, result; + 
ASSERT_EQ(cuBool_Matrix_New(&matrix, test_matrix.nrows, test_matrix.ncols), CUBOOL_STATUS_SUCCESS); + ASSERT_EQ(cuBool_Matrix_New(&mask, test_mask.nrows, test_mask.ncols), CUBOOL_STATUS_SUCCESS); + ASSERT_EQ(cuBool_Matrix_New(&result, test_matrix.nrows, test_matrix.ncols), CUBOOL_STATUS_SUCCESS); + + ASSERT_EQ(cuBool_Matrix_Build(matrix, test_matrix.rowsIndex.data(), test_matrix.colsIndex.data(), test_matrix.nvals, + CUBOOL_HINT_VALUES_SORTED & CUBOOL_HINT_NO_DUPLICATES), CUBOOL_STATUS_SUCCESS); + ASSERT_EQ(cuBool_Matrix_Build(mask, test_mask.rowsIndex.data(), test_mask.colsIndex.data(), test_mask.nvals, + CUBOOL_HINT_VALUES_SORTED & CUBOOL_HINT_NO_DUPLICATES), CUBOOL_STATUS_SUCCESS); + + cuBool_Matrix_EWiseMulInverted(result, matrix, mask, CUBOOL_HINT_NO); + + // validate value of algorithm + cuBool_Index nvals; + cuBool_Matrix_Nvals(result, &nvals); + std::vector rows(nvals), cols(nvals); + cuBool_Matrix_ExtractPairs(result, rows.data(), cols.data(), &nvals); + + cuBool_Matrix_Free(matrix); + cuBool_Matrix_Free(mask); + cuBool_Matrix_Free(result); + + auto mask_data_inverted = mask_data; + for (auto &row : mask_data_inverted) { + for (int &value : row) { + value = !value; + } + } + + std::vector result_data(matrix_data.size(), std::vector(matrix_data[0].size(), 0)); + for (int i = 0; i < nvals; i++) { + result_data[rows[i]][cols[i]] = 1; + } + + for (int i = 0; i < matrix_data.size(); i++) { + for (int j = 0; j < matrix_data[0].size(); j++) { + ASSERT_EQ(matrix_data[i][j] * mask_data_inverted[i][j], result_data[i][j]); + } + } + +} + +TEST(cuBool_Matrix, ApplyMatrix) { + ASSERT_EQ(cuBool_Initialize(CUBOOL_HINT_NO), CUBOOL_STATUS_SUCCESS); + + DataMatrix matrix { + {1, 0, 0}, + {0, 0, 0}, + {0, 1, 0}, + }; + + DataMatrix mask { + {0, 1, 1}, + {1, 0, 1}, + {0, 1, 1}, + }; + // iverted is + // 1 0 0 + // 0 1 0 + // 1 0 0 + // matrix & ~mask must have (0, 0) + + testApplyNotMask(matrix, mask); + + ASSERT_EQ(cuBool_Finalize(), CUBOOL_STATUS_SUCCESS); +} + 
+TEST(cuBool_Matrix, ApplyMatrixRandom) { + ASSERT_EQ(cuBool_Initialize(CUBOOL_HINT_NO), CUBOOL_STATUS_SUCCESS); + + for (int i = 0; i < 102; i++) { + int n = rand() % 10 + 1; + int m = rand() % 10 + 1; + + DataMatrix matrix(n, std::vector(m, 0)); + DataMatrix mask(n, std::vector(m, 0)); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < m; j++) { + matrix[i][j] = rand() & 1; + mask[i][j] = rand() & 1; + } + } + + testApplyNotMask(matrix, mask); + } + + ASSERT_EQ(cuBool_Finalize(), CUBOOL_STATUS_SUCCESS); +} + + +CUBOOL_GTEST_MAIN diff --git a/deps/cub b/deps/cub deleted file mode 160000 index b229817..0000000 --- a/deps/cub +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b229817e3963fc942c7cc2c61715a6b2b2c49bed diff --git a/deps/gtest b/deps/gtest index d9c309f..6910c9d 160000 --- a/deps/gtest +++ b/deps/gtest @@ -1 +1 @@ -Subproject commit d9c309fdab807b716c2cf4d4a42989b8c34f712a +Subproject commit 6910c9d9165801d8827d628cb72eb7ea9dd538c5 diff --git a/deps/nsparse-um/CMakeLists.txt b/deps/nsparse-um/CMakeLists.txt index ee5ac0d..91ed084 100644 --- a/deps/nsparse-um/CMakeLists.txt +++ b/deps/nsparse-um/CMakeLists.txt @@ -3,7 +3,7 @@ project(nsparse_um LANGUAGES CXX CUDA) add_library(nsparse_um INTERFACE) target_include_directories(nsparse_um INTERFACE include/) -target_link_libraries(nsparse_um INTERFACE cub) +target_link_libraries(nsparse_um INTERFACE) target_compile_options(nsparse_um INTERFACE $<$: --expt-relaxed-constexpr --expt-extended-lambda>) if (CUBOOL_BUILD_NSPARSE_TESTS) diff --git a/deps/nsparse-um/include/nsparse/unified_allocator.h b/deps/nsparse-um/include/nsparse/unified_allocator.h index ea6a98b..9c3b2f8 100644 --- a/deps/nsparse-um/include/nsparse/unified_allocator.h +++ b/deps/nsparse-um/include/nsparse/unified_allocator.h @@ -3,7 +3,6 @@ #include #include #include -#include #include #include diff --git a/deps/nsparse-um/test/CMakeLists.txt b/deps/nsparse-um/test/CMakeLists.txt index 3bece29..6a1aaff 100644 --- 
a/deps/nsparse-um/test/CMakeLists.txt +++ b/deps/nsparse-um/test/CMakeLists.txt @@ -3,7 +3,7 @@ project(nsparse_um_test CXX CUDA) add_executable(${PROJECT_NAME} src/nsparse_test.cu src/utils.cpp) -target_link_libraries(${PROJECT_NAME} PRIVATE gtest_main nsparse_um cub) +target_link_libraries(${PROJECT_NAME} PRIVATE gtest_main nsparse_um) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14) set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 17) diff --git a/deps/nsparse/test/CMakeLists.txt b/deps/nsparse/test/CMakeLists.txt index 9e71d90..5abd415 100644 --- a/deps/nsparse/test/CMakeLists.txt +++ b/deps/nsparse/test/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.11) project(nsparse_test CXX CUDA) add_executable(${PROJECT_NAME} src/nsparse_test.cu src/utils.cpp) -target_link_libraries(${PROJECT_NAME} PRIVATE gtest_main nsparse cub) +target_link_libraries(${PROJECT_NAME} PRIVATE gtest_main nsparse) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14) set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 17) diff --git a/python/publish_pypi.sh b/python/publish_pypi.sh index 40d4c65..0c78e87 100644 --- a/python/publish_pypi.sh +++ b/python/publish_pypi.sh @@ -1,2 +1,3 @@ +#!/bin/bash python3 setup.py sdist bdist_wheel twine upload --verbose dist/* \ No newline at end of file diff --git a/python/publish_t_pypi.sh b/python/publish_t_pypi.sh index aa781e1..d68cbdd 100644 --- a/python/publish_t_pypi.sh +++ b/python/publish_t_pypi.sh @@ -1,2 +1,3 @@ +#!/bin/bash python3 setup.py sdist bdist_wheel twine upload --verbose --repository testpypi dist/* \ No newline at end of file diff --git a/python/run_tests.sh b/python/run_tests.sh index febf012..0c216b3 100644 --- a/python/run_tests.sh +++ b/python/run_tests.sh @@ -1,3 +1,4 @@ +#!/bin/bash export PYTHONPATH="`pwd`:$PYTHONPATH" cd tests python3 -m unittest discover -v diff --git a/scripts/install_cuda_ubuntu.sh b/scripts/install_cuda_ubuntu.sh index 8c7e5ed..019f2aa 100644 --- 
a/scripts/install_cuda_ubuntu.sh +++ b/scripts/install_cuda_ubuntu.sh @@ -1,8 +1,13 @@ +#!/bin/bash # Original script from https://github.com/ptheywood/cuda-cmake-github-actions CUDA_PACKAGES_IN=( - "command-line-tools" - "libraries-dev" + "cuda-compiler" + "cuda-cudart-dev" + "cuda-nvtx" + "cuda-nvrtc-dev" + "libcurand-dev" # 11-0+ + "cuda-cccl" # 11.4+, provides cub and thrust. On 11.3 known as cuda-thrust-11-3 ) # returns 0 (true) if a >= b @@ -13,7 +18,7 @@ function version_ge() { # returns 0 (true) if a > b function version_gt() { [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 - [ "$1" = "$2" ] && return 1 || version_ge $1 $2 + [ "$1" = "$2" ] && return 1 || version_ge "$1" "$2" } # returns 0 (true) if a <= b function version_le() { @@ -23,15 +28,16 @@ function version_le() { # returns 0 (true) if a < b function version_lt() { [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 - [ "$1" = "$2" ] && return 1 || version_le $1 $2 + [ "$1" = "$2" ] && return 1 || version_le "$1" "$2" } ## Select CUDA version # Get the cuda version from the environment as $cuda. -CUDA_VERSION_MAJOR_MINOR=${cuda} +CUDA_VERSION_MAJOR_MINOR=${cuda:=12.8} # Split the version. +# We (might/probably) don't know PATCH at this point - it depends which version gets installed. CUDA_MAJOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f1) CUDA_MINOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f2) CUDA_PATCH=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f3) @@ -55,7 +61,7 @@ if [ -z "${CUDA_MINOR}" ] ; then exit 1 fi # If we don't know the Ubuntu version, error. -if [ -z ${UBUNTU_VERSION} ]; then +if [ -z "${UBUNTU_VERSION}" ]; then echo "Error: Unknown Ubuntu version. Aborting." exit 1 fi @@ -64,43 +70,82 @@ fi ## Select CUDA packages to install CUDA_PACKAGES="" for package in "${CUDA_PACKAGES_IN[@]}" -do : +do : + # @todo This is not perfect. 
Should probably provide a separate list for diff versions # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y - if [[ "${package}" == "nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then - package="compiler" - elif [[ "${package}" == "compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then - package="nvcc" + if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + package="cuda-compiler" + elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + package="cuda-nvcc" + # CUB/Thrust are packages in cuda-thrust in 11.3, but cuda-cccl in 11.4+ + elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]]; then + # CUDA cuda-thrust >= 11.4 + if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" ; then + package="cuda-cccl" + # Use cuda-thrust > 11.2 + elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" ; then + package="cuda-thrust" + # Do not include this pacakge < 11.3 + else + continue + fi + fi + # CUDA 11+ includes lib* / lib*-dev packages, which if they existed previously where cuda-cu*- / cuda-cu*-dev- + if [[ ${package} == libcu* ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "11.0" ; then + package="${package/libcu/cuda-cu}" fi # Build the full package name and append to the string. 
- CUDA_PACKAGES+=" cuda-${package}-${CUDA_MAJOR}-${CUDA_MINOR}" + CUDA_PACKAGES+=" ${package}-${CUDA_MAJOR}-${CUDA_MINOR}" done echo "CUDA_PACKAGES ${CUDA_PACKAGES}" +CPU_ARCH="x86_64" PIN_FILENAME="cuda-ubuntu${UBUNTU_VERSION}.pin" -PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/${PIN_FILENAME}" -APT_KEY_URL="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/7fa2af80.pub" -REPO_URL="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/" +PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/${PIN_FILENAME}" +# apt keyring package now available https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ +KERYRING_PACKAGE_FILENAME="cuda-keyring_1.1-1_all.deb" +KEYRING_PACKAGE_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/${KERYRING_PACKAGE_FILENAME}" +REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/" echo "PIN_FILENAME ${PIN_FILENAME}" echo "PIN_URL ${PIN_URL}" -echo "APT_KEY_URL ${APT_KEY_URL}" +echo "KEYRING_PACKAGE_URL ${KEYRING_PACKAGE_URL}" -sudo rm -r /var/lib/apt/lists/* -sudo apt-get clean -sudo apt-get update +# Detect if the script is being run as root, storing true/false in is_root. +is_root=false +if (( EUID == 0)); then + is_root=true +fi +# Find if sudo is available +has_sudo=false +if command -v sudo &> /dev/null ; then + has_sudo=true +fi +# Decide if we can proceed or not (root or sudo is required) and if so store whether sudo should be used or not. +if [ "$is_root" = false ] && [ "$has_sudo" = false ]; then + echo "Root or sudo is required. Aborting." 
+ exit 1 +elif [ "$is_root" = false ] ; then + USE_SUDO=sudo +else + USE_SUDO= +fi + +$USE_SUDO rm -r /var/lib/apt/lists/* +$USE_SUDO apt-get clean +$USE_SUDO apt-get update ## Install echo "Adding CUDA Repository" -wget ${PIN_URL} -sudo mv ${PIN_FILENAME} /etc/apt/preferences.d/cuda-repository-pin-600 -sudo apt-key adv --fetch-keys ${APT_KEY_URL} -sudo add-apt-repository "deb ${REPO_URL} /" -sudo apt-get update +wget "${PIN_URL}" +$USE_SUDO mv "${PIN_FILENAME}" /etc/apt/preferences.d/cuda-repository-pin-600 +wget "${KEYRING_PACKAGE_URL}" && ${USE_SUDO} dpkg -i ${KERYRING_PACKAGE_FILENAME} && rm ${KERYRING_PACKAGE_FILENAME} +$USE_SUDO add-apt-repository "deb ${REPO_URL} /" +$USE_SUDO apt-get update echo "Installing CUDA packages ${CUDA_PACKAGES}" -sudo apt-get -y install ${CUDA_PACKAGES} -if [[ $? -ne 0 ]]; then +if ! $USE_SUDO apt-get -y install "${CUDA_PACKAGES}"; then echo "CUDA Installation Error." exit 1 fi @@ -109,7 +154,8 @@ fi CUDA_PATH=/usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} echo "CUDA_PATH=${CUDA_PATH}" export CUDA_PATH=${CUDA_PATH} - export PATH="$CUDA_PATH/bin:$PATH" export LD_LIBRARY_PATH="$CUDA_PATH/lib:$LD_LIBRARY_PATH" -nvcc -V \ No newline at end of file +export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$LD_LIBRARY_PATH" +# Check nvcc is now available. +nvcc -V diff --git a/scripts/run_tests_all.sh b/scripts/run_tests_all.sh index 8f2e7e0..1fd405f 100644 --- a/scripts/run_tests_all.sh +++ b/scripts/run_tests_all.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Runs all tests executables # Invoke this script within build directory ./cubool/tests/test_library_api diff --git a/scripts/run_tests_fallback.sh b/scripts/run_tests_fallback.sh index f468fc1..50a4f84 100644 --- a/scripts/run_tests_fallback.sh +++ b/scripts/run_tests_fallback.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Runs all tests executables # Invoke this script within build directory ./cubool/tests/test_library_api