## HPC exercise: training a neural network on the MNIST data set
- The exercise explores training a neural network using [the torch C++ API](https://pytorch.org/cppdocs/).

*(Figure: example handwritten digits from the MNIST data set.)*

You will learn how to train a network to recognize handwritten digits. To do so we will use the MNIST data set.
The figure above shows example images. The exercise assumes you are working on the systems at the Jülich Supercomputing Centre.
To solve this exercise, look through the files in the `source` folder. `TODO`s mark parts of the code that require your attention.
Come back to this README for additional hints.

- To get started on the JUWELS Booster, load the required modules:
```bash
module load Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 CUDA/11.7 CMake PyTorch
```

- Use `mkdir build` to create your build directory. Change into the build folder and compile by running:
```bash
cmake -DCUDA_CUDA_LIB=/usr/lib64/libcuda.so -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` ..
cmake --build . --config Release
```

- Navigate to `source/net.h` and implement the constructor for the `Net` struct.
The `Net` should implement a fully connected network

$$
 y = \ln(\sigma (W_3 f_r(W_2 f_r(W_1 x + b_1) + b_2) + b_3))
$$
| 28 | + |
| 29 | +with $W_1 \in \mathbb{R}^{h_1, n}, W_2 \in \mathbb{R}^{h_2, h_1}, W_3 \in \mathbb{R}^{m, h_2}$ |
| 30 | +and $b_1 \in \mathbb{R}^{h_1}, b_2 \in \mathbb{R}^{h_2}, b_3 \in \mathbb{R}^{m}$, where |
| 31 | +$n$ denotes the input dimension $h_1$ the number of hidden neurons in the first layer $h_2$ the number of neurons in the second layer, and $m$ the number of output neurons. |
| 32 | +Finally $\sigma$ denotes the [softmax function](https://en.wikipedia.org/wiki/Softmax_function) and $\ln$ the natural logarithm. |
Use `register_module` to add `Linear` layers to the network. Linear layers that implement $Wx + b$ are provided by `torch::nn::Linear`.
Move on to implement the forward pass. Follow the equation above, using `torch::relu` and
`torch::log_softmax`. What happens if you choose `torch::sigmoid` instead of the ReLU?
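
If you want to sanity-check your structure, a minimal sketch could look as follows. The layer names `fc1`–`fc3` and the four-argument constructor are illustrative assumptions; the `TODO`s in `source/net.h` define the actual interface.

```cpp
#include <torch/torch.h>

struct Net : torch::nn::Module {
  // n: input dimension, h1/h2: hidden layer sizes, m: number of classes.
  Net(int64_t n, int64_t h1, int64_t h2, int64_t m) {
    fc1 = register_module("fc1", torch::nn::Linear(n, h1));
    fc2 = register_module("fc2", torch::nn::Linear(h1, h2));
    fc3 = register_module("fc3", torch::nn::Linear(h2, m));
  }

  torch::Tensor forward(torch::Tensor x) {
    x = torch::relu(fc1->forward(x));  // f_r(W_1 x + b_1)
    x = torch::relu(fc2->forward(x));  // f_r(W_2 ... + b_2)
    // ln(softmax(W_3 ... + b_3)), computed in one numerically stable call.
    return torch::log_softmax(fc3->forward(x), /*dim=*/1);
  }

  torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr};
};
```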

- Before training your network, implement the `acc` function in `source/train_net.cpp`. It should compute the ratio of
correctly identified digits by comparing the `argmax` of the network output with the annotations.
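
A possible shape for this function, assuming it receives the raw network outputs and the integer labels as tensors (the actual signature in `source/train_net.cpp` may differ):

```cpp
// Sketch of an accuracy function; the signature is an assumption.
double acc(const torch::Tensor& output, const torch::Tensor& labels) {
  // argmax over the class dimension yields the predicted digit per sample.
  auto prediction = output.argmax(/*dim=*/1);
  // Ratio of correct predictions over the batch.
  auto correct = prediction.eq(labels).sum().item<int64_t>();
  return static_cast<double>(correct) / static_cast<double>(labels.size(0));
}
```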

- Torch devices are created via, e.g., `torch::Device device = torch::kCPU;`. Move computation to the GPUs by choosing `torch::kCUDA` when CUDA-capable GPUs are available.
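
A common pattern is to select the device at runtime; `net` and `data` below are placeholder names for your module and a batch tensor:

```cpp
// Fall back to the CPU when no CUDA device is visible to the process.
torch::Device device = torch::cuda::is_available() ? torch::kCUDA : torch::kCPU;
net->to(device);         // move the model parameters to the device
data = data.to(device);  // input tensors must be moved as well
```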

- Finally, iterate over the test data set and compute the test accuracy.
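
A sketch of such an evaluation loop, assuming `test_loader` was created with `torch::data::make_data_loader` over the MNIST test split using the `torch::data::transforms::Stack<>()` collation, and that `net`, `acc`, and `device` exist as above:

```cpp
torch::NoGradGuard no_grad;  // disable gradient tracking during evaluation
double acc_sum = 0.0;
int64_t batches = 0;
for (const auto& batch : *test_loader) {
  // Flatten the 1x28x28 images into 784-dimensional vectors for the MLP.
  auto images = batch.data.to(device).reshape({batch.data.size(0), 784});
  auto labels = batch.target.to(device);
  acc_sum += acc(net->forward(images), labels);
  ++batches;
}
// Averaging per-batch accuracies is exact when all batches have equal size.
std::cout << "test accuracy: " << acc_sum / batches << std::endl;
```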

- Train and test your network by executing:
```bash
./train_net
```

- When your network has converged, you should measure more than 90% accuracy on the test set.