From eddaaedbb2aea42d77f159ab6b70151067d2b71f Mon Sep 17 00:00:00 2001 From: Chen Hu <86994637+Tigerrr07@users.noreply.github.com> Date: Fri, 21 Jan 2022 20:19:46 +0800 Subject: [PATCH 1/2] save --- CMakeLists.txt | 3 ++ main.cpp | 106 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 73 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..6445e4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,3 +7,6 @@ if (NOT CMAKE_BUILD_TYPE) endif() add_executable(main main.cpp) +target_compile_options(main PUBLIC -ffast-math -march=native) +find_package(OpenMP REQUIRED) +target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) \ No newline at end of file diff --git a/main.cpp b/main.cpp index cf6369b..557a225 100644 --- a/main.cpp +++ b/main.cpp @@ -8,60 +8,94 @@ float frand() { return (float)rand() / RAND_MAX * 2 - 1; } +constexpr size_t length = 48; struct Star { - float px, py, pz; - float vx, vy, vz; - float mass; + float px[length], py[length], pz[length]; + float vx[length], vy[length], vz[length]; + float mass[length]; }; -std::vector stars; +Star stars; void init() { - for (int i = 0; i < 48; i++) { - stars.push_back({ - frand(), frand(), frand(), - frand(), frand(), frand(), - frand() + 1, - }); + #pragma omp simd + for (size_t i = 0; i < length; i++) { + stars.px[i] = frand(); + stars.py[i] = frand(); + stars.pz[i] = frand(); + stars.vx[i] = frand(); + stars.vy[i] = frand(); + stars.vz[i] = frand(); + stars.mass[i] = frand() + 1; } } -float G = 0.001; -float eps = 0.001; -float dt = 0.01; +constexpr float G = 0.001; +constexpr float eps = 0.001; +constexpr float dt = 0.01; +// 提前计算, 减少不必要的乘法 +constexpr float Gdt = G * dt; +constexpr float eps2 = eps * eps; +constexpr float G2 = G / 2; void step() { - for (auto &star: stars) { - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - d2 *= sqrt(d2); - star.vx += dx * other.mass * G * dt / d2; - star.vy += dy * other.mass * G * dt / d2; - star.vz += dz * other.mass * G * dt / d2; + for (size_t i = 0; i < length; i++) { + // 减少不必要的内存访问 + float spxi = stars.px[i]; + float spyi = stars.py[i]; + float spzi = stars.pz[i]; + + // 先累加到初始化为0的局部变量 + float tmp_vxi = 0; + float tmp_vyi = 0; + float tmp_vzi = 0; + + for (size_t j = 0; j < length; j++) { + float dx = stars.px[j] - spxi; + float dy = stars.py[j] - spyi; + float dz = stars.pz[j] - spzi; + float d2 = dx * dx + dy * dy + dz * dz + eps2; + d2 *= std::sqrt(d2); + tmp_vxi += dx * stars.mass[j] * Gdt / d2; + tmp_vyi += dy * stars.mass[j] * Gdt / d2; + tmp_vzi += dz * stars.mass[j] * Gdt / d2; } + // 累加结束后再写入到全局变量中 + stars.vx[i] += tmp_vxi; + stars.vy[i] += tmp_vyi; + stars.vz[i] += tmp_vzi; } - for (auto &star: stars) { - star.px += star.vx * dt; - star.py += star.vy * dt; - star.pz += star.vz * dt; + + for (size_t i = 0; i < length; i++) { + stars.px[i] += stars.vx[i] * dt; + stars.py[i] += stars.vy[i] * dt; + stars.pz[i] += stars.vz[i] * dt; } } float calc() { float energy = 0; - for (auto &star: stars) { - float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; - energy += star.mass * v2 / 2; - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= other.mass * star.mass * G / sqrt(d2) / 2; + for (size_t i = 0; i < length; i++) { + float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] + stars.vz[i]; + energy += stars.mass[i] * v2 / 2; + + // 减少不必要的内存访问 + float pxi = stars.px[i]; + float pyi = stars.py[i]; + float pzi = stars.pz[i]; + float massi = stars.mass[i]; + + // 先累加到初始化为0的局部变量 + float tmp = 0; + for (size_t j = 0; j < length; j++) { + float dx = stars.px[j] - pxi; + float dy = stars.py[j] - pyi; + float dz = stars.pz[j] - pzi; + float d2 = dx * dx + dy * dy + dz * dz + eps2; + tmp += stars.mass[j] * massi * G2 / std::sqrt(d2); } + // 累加结束后写入 + energy -= tmp; } return energy; } From 34063987331605cc9a6c717e56cd8c39e1bfa879 Mon Sep 17 00:00:00 2001 From: Chen Hu <86994637+Tigerrr07@users.noreply.github.com> Date: Sat, 22 Jan 2022 09:42:38 +0800 Subject: [PATCH 2/2] hw04 done. --- CMakeLists.txt | 4 +--- main.cpp | 36 +++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6445e4e..ed308c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,4 @@ if (NOT CMAKE_BUILD_TYPE) endif() add_executable(main main.cpp) -target_compile_options(main PUBLIC -ffast-math -march=native) -find_package(OpenMP REQUIRED) -target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) \ No newline at end of file +target_compile_options(main PUBLIC -ffast-math -march=native) \ No newline at end of file diff --git a/main.cpp b/main.cpp index 557a225..10d0574 100644 --- a/main.cpp +++ b/main.cpp @@ -4,8 +4,10 @@ #include #include +// 减少初始化时的乘法 +constexpr float RAND_MAX2 = 1.f / RAND_MAX * 2; float frand() { - return (float)rand() / RAND_MAX * 2 - 1; + return (float)rand() * RAND_MAX2 - 1; } constexpr size_t length = 48; @@ -18,7 +20,7 @@ struct Star { Star stars; void init() { - #pragma omp simd + #pragma GCC unroll 8 for (size_t i = 0; i < length; i++) { stars.px[i] = frand(); stars.py[i] = frand(); @@ -50,22 +52,27 @@ void step() { float tmp_vyi = 0; float tmp_vzi = 0; + #pragma GCC unroll 8 for (size_t j = 0; j < length; j++) { float dx = stars.px[j] - spxi; float dy = stars.py[j] - spyi; float dz = stars.pz[j] - spzi; float d2 = dx * dx + dy * dy + dz * dz + eps2; d2 *= std::sqrt(d2); - tmp_vxi += dx * stars.mass[j] * Gdt / d2; - tmp_vyi += dy * stars.mass[j] * Gdt / d2; - tmp_vzi += dz * stars.mass[j] * Gdt / d2; + // Gdt = G * dt 放到for循环外部 + d2 = 1.f / d2; + // 乘法变加法 + tmp_vxi += dx * stars.mass[j] * d2; + tmp_vyi += dy * stars.mass[j] * d2; + tmp_vzi += dz * stars.mass[j] * d2; } // 累加结束后再写入到全局变量中 - stars.vx[i] += tmp_vxi; - stars.vy[i] += tmp_vyi; - stars.vz[i] += tmp_vzi; + stars.vx[i] += tmp_vxi * Gdt; + stars.vy[i] += tmp_vyi * Gdt; + stars.vz[i] += tmp_vzi * Gdt; } + #pragma GCC unroll 8 for (size_t i = 0; i < length; i++) { stars.px[i] += stars.vx[i] * dt; stars.py[i] += stars.vy[i] * dt; @@ -76,26 +83,29 @@ void step() { float calc() { float energy = 0; for (size_t i = 0; i < length; i++) { - float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] + stars.vz[i]; - energy += stars.mass[i] * v2 / 2; - // 减少不必要的内存访问 float pxi = stars.px[i]; float pyi = stars.py[i]; float pzi = stars.pz[i]; float massi = stars.mass[i]; + + float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i]; + energy += massi * v2 / 2; // 先累加到初始化为0的局部变量 float tmp = 0; + #pragma GCC unroll 8 for (size_t j = 0; j < length; j++) { float dx = stars.px[j] - pxi; float dy = stars.py[j] - pyi; float dz = stars.pz[j] - pzi; float d2 = dx * dx + dy * dy + dz * dz + eps2; - tmp += stars.mass[j] * massi * G2 / std::sqrt(d2); + // 将massi = stars.mass[i]和G2 = G / 2放到for循环外部 + // 减少乘法次数 + tmp += stars.mass[j] / std::sqrt(d2); } // 累加结束后写入 - energy -= tmp; + energy -= tmp * massi * G2; } return energy; }