diff --git a/Makefile b/Makefile
index 75b5fda..5df943b 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ OBJET = $(addprefix $(OBJDIR)/, \
 )
 
 CCAP = 86
-CUDA = /usr/local/cuda-11.8
+CUDA = /usr/local/cuda
 CXX = g++
 CXXCUDA = /usr/bin/g++
 
@@ -30,11 +30,11 @@ NOSTR_BLOCKS_PER_GRID = 512
 NOSTR_THREADS_PER_BLOCK = 256
 KEYS_PER_THREAD_BATCH = 64
 
-CXXFLAGS = -DWITHGPU -m64 -mssse3 -Wno-write-strings -O2 -I$(SRCDIR) -I$(CUDA)/include \
+CXXFLAGS = -DWITHGPU -march=native -Wno-write-strings -O2 -I$(SRCDIR) -I$(CUDA)/include \
 	-DNOSTR_BLOCKS_PER_GRID=$(NOSTR_BLOCKS_PER_GRID) \
 	-DNOSTR_THREADS_PER_BLOCK=$(NOSTR_THREADS_PER_BLOCK) \
 	-DKEYS_PER_THREAD_BATCH=$(KEYS_PER_THREAD_BATCH)
-LFLAGS = /usr/lib/x86_64-linux-gnu/libgmp.so.10 -lpthread -L$(CUDA)/lib64 -lcudart -lcurand
+LFLAGS = -lgmp -lpthread -L$(CUDA)/lib64 -lcudart -lcurand
 NVCC = $(CUDA)/bin/nvcc
 
 #--------------------------------------------------------------------
@@ -42,7 +42,7 @@ NVCC = $(CUDA)/bin/nvcc
 all: rummage
 
 $(OBJDIR)/GPU/GPURummage.o: $(SRCDIR)/GPU/GPURummage.cu
-	$(NVCC) -allow-unsupported-compiler --compile --compiler-options -fPIC -ccbin $(CXXCUDA) -m64 -O2 -I$(SRCDIR) -I$(CUDA)/include \
+	$(NVCC) -allow-unsupported-compiler --compile --compiler-options -fPIC -ccbin $(CXXCUDA) -O2 -I$(SRCDIR) -I$(CUDA)/include \
 	-DNOSTR_BLOCKS_PER_GRID=$(NOSTR_BLOCKS_PER_GRID) \
 	-DNOSTR_THREADS_PER_BLOCK=$(NOSTR_THREADS_PER_BLOCK) \
 	-DKEYS_PER_THREAD_BATCH=$(KEYS_PER_THREAD_BATCH) \
diff --git a/src/CPU/Int.cpp b/src/CPU/Int.cpp
index dfa00a7..12ac65c 100644
--- a/src/CPU/Int.cpp
+++ b/src/CPU/Int.cpp
@@ -18,7 +18,9 @@
 #include "Int.h"
 #include <string.h>
 #include <math.h>
+#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64)
 #include <emmintrin.h>
+#endif
 
 #define MAX(x,y) (((x)>(y))?(x):(y))
 #define MIN(x,y) (((x)<(y))?(x):(y))
diff --git a/src/CPU/Int.h b/src/CPU/Int.h
index 643798b..ead2993 100644
--- a/src/CPU/Int.h
+++ b/src/CPU/Int.h
@@ -215,6 +215,57 @@ class Int {
 
 #ifndef WIN64
 
+#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
+
+// ===== ARM64 IMPLEMENTATIONS =====
+
+static uint64_t inline _umul128(uint64_t a, uint64_t b, uint64_t *h) {
+  unsigned __int128 res = (unsigned __int128)a * (unsigned __int128)b;
+  *h = (uint64_t)(res >> 64);
+  return (uint64_t)res;
+}
+
+static int64_t inline _mul128(int64_t a, int64_t b, int64_t *h) {
+  __int128 res = (__int128)a * (__int128)b;
+  *h = (int64_t)(res >> 64);
+  return (int64_t)res;
+}
+
+static uint64_t inline _udiv128(uint64_t hi, uint64_t lo, uint64_t d, uint64_t *r) {
+  unsigned __int128 dividend = ((unsigned __int128)hi << 64) | lo;
+  *r = (uint64_t)(dividend % d);
+  return (uint64_t)(dividend / d);
+}
+
+static uint64_t inline __rdtsc() {
+  uint64_t val;
+  __asm__ volatile("mrs %0, cntvct_el0" : "=r"(val));
+  return val;
+}
+
+#define __shiftright128(a,b,n) (((a)>>(n))|((b)<<(64-(n))))
+#define __shiftleft128(a,b,n) (((b)<<(n))|((a)>>(64-(n))))
+
+static inline unsigned char _addcarry_u64(unsigned char c_in, uint64_t a, uint64_t b, uint64_t *out) {
+  unsigned __int128 sum = (unsigned __int128)a + (unsigned __int128)b + (unsigned __int128)c_in;
+  *out = (uint64_t)sum;
+  return (unsigned char)(sum >> 64);
+}
+
+static inline unsigned char _subborrow_u64(unsigned char b_in, uint64_t a, uint64_t b, uint64_t *out) {
+  unsigned __int128 diff = (unsigned __int128)a - (unsigned __int128)b - (unsigned __int128)b_in;
+  *out = (uint64_t)diff;
+  return (unsigned char)((diff >> 64) & 1);
+}
+
+#define _byteswap_uint64 __builtin_bswap64
+#define LZC(x) __builtin_clzll(x)
+#define TZC(x) __builtin_ctzll(x)
+
+#else
+
+// ===== X86-64 IMPLEMENTATIONS =====
+
 // Missing intrinsics
 static uint64_t inline _umul128(uint64_t a, uint64_t b, uint64_t *h) {
   uint64_t rhi;
@@ -257,6 +308,8 @@ static uint64_t inline __rdtsc() {
 #define LZC(x) __builtin_clzll(x)
 #define TZC(x) __builtin_ctzll(x)
 
+#endif // ARM64 vs x86-64
+
 #else
 
 #include <intrin.h>
diff --git a/src/CPU/IntMod.cpp b/src/CPU/IntMod.cpp
index 00c990a..1bd2e8c 100644
--- a/src/CPU/IntMod.cpp
+++ b/src/CPU/IntMod.cpp
@@ -16,7 +16,15 @@
 */
 
 #include "Int.h"
-#include <emmintrin.h>
+#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
+  #include <arm_neon.h>
+  typedef int64x2_t __m128i;
+  #define _mm_slli_epi64(a, count) vshlq_n_s64(a, count)
+  #define _mm_add_epi64(a, b) vaddq_s64(a, b)
+  #define _mm_sub_epi64(a, b) vsubq_s64(a, b)
+#else
+  #include <emmintrin.h>
+#endif
 #include <string.h>
 
 #define MAX(x,y) (((x)>(y))?(x):(y))
@@ -234,6 +242,9 @@ void Int::DivStep62(Int* u,Int* v,int64_t* eta,int* pos,int64_t* uu,int64_t* uv,
   _u.m128i_u64[1] = 0;
   _v.m128i_u64[0] = 0;
   _v.m128i_u64[1] = 1;
+#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
+  _u = vsetq_lane_s64(1, vdupq_n_s64(0), 0);
+  _v = vsetq_lane_s64(1, vdupq_n_s64(0), 1);
 #else
   ((int64_t *)&_u)[0] = 1;
   ((int64_t *)&_u)[1] = 0;
@@ -271,6 +282,11 @@ void Int::DivStep62(Int* u,Int* v,int64_t* eta,int* pos,int64_t* uu,int64_t* uv,
   *uv = _u.m128i_u64[1];
   *vu = _v.m128i_u64[0];
   *vv = _v.m128i_u64[1];
+#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
+  *uu = vgetq_lane_s64(_u, 0);
+  *uv = vgetq_lane_s64(_u, 1);
+  *vu = vgetq_lane_s64(_v, 0);
+  *vv = vgetq_lane_s64(_v, 1);
 #else
   *uu = ((int64_t *)&_u)[0];
   *uv = ((int64_t *)&_u)[1];
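
Note (not part of the patch): the ARM64 carry/multiply shims in Int.h are plain C over
the compiler's native __int128, so they can be sanity-checked in isolation. The
standalone harness below is a sketch under that assumption; it re-declares two of the
helpers under hypothetical names (umul128_ref, addcarry_ref) so it builds with any
64-bit GCC or Clang, on x86-64 or ARM64, e.g. with `g++ -O2 shim_check.cpp`.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Same construction as the patch's _umul128: let __int128 do the 128-bit work.
static inline uint64_t umul128_ref(uint64_t a, uint64_t b, uint64_t *h) {
  unsigned __int128 res = (unsigned __int128)a * (unsigned __int128)b;
  *h = (uint64_t)(res >> 64);
  return (uint64_t)res;
}

// Same construction as the patch's _addcarry_u64.
static inline unsigned char addcarry_ref(unsigned char c_in, uint64_t a,
                                         uint64_t b, uint64_t *out) {
  unsigned __int128 sum = (unsigned __int128)a + (unsigned __int128)b + c_in;
  *out = (uint64_t)sum;
  return (unsigned char)(sum >> 64);
}

int main() {
  // (2^64 - 1)^2 = 2^128 - 2^65 + 1, i.e. high limb 2^64 - 2, low limb 1.
  uint64_t hi;
  uint64_t lo = umul128_ref(~0ULL, ~0ULL, &hi);
  assert(lo == 1 && hi == ~0ULL - 1);

  // 128-bit addition as two chained 64-bit adds: the low limb overflows
  // and the carry must land in the high limb.
  uint64_t r0, r1;
  unsigned char c = addcarry_ref(0, ~0ULL, 1, &r0);
  c = addcarry_ref(c, 41, 0, &r1);
  assert(r0 == 0 && r1 == 42 && c == 0);

  puts("scalar shim checks passed");
  return 0;
}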
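
The IntMod.cpp change swaps the two-lane SSE2 state in DivStep62 for NEON. One caveat
worth flagging: vshlq_n_s64 takes an immediate, so the _mm_slli_epi64 macro only
compiles where the shift count is a compile-time constant, which holds as long as
DivStep62 shifts by constants. A minimal ARM64-only check of the lane layout the patch
sets up ({1, 0} and {0, 1}, the identity transition matrix); again a sketch, not part
of the patch:

#include <arm_neon.h>
#include <cassert>

int main() {
  // Matches the patch: _u = {1, 0}, _v = {0, 1}.
  int64x2_t u = vsetq_lane_s64(1, vdupq_n_s64(0), 0);
  int64x2_t v = vsetq_lane_s64(1, vdupq_n_s64(0), 1);

  // Exercise the three mapped ops: immediate shift-left, add, subtract.
  // {1,0} << 3 = {8,0}; + {0,1} = {8,1}; - {1,0} = {7,1}.
  int64x2_t t = vsubq_s64(vaddq_s64(vshlq_n_s64(u, 3), v), u);
  assert(vgetq_lane_s64(t, 0) == 7);
  assert(vgetq_lane_s64(t, 1) == 1);
  return 0;
}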
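
Finally, the __rdtsc replacement reads CNTVCT_EL0, the ARM generic timer's virtual
count, which is readable from user space on Linux. Unlike the x86 TSC it ticks at the
generic timer frequency (commonly tens of MHz up to 1 GHz), which is fine for seeding
and relative timing but not directly comparable across architectures. If absolute time
were ever needed, the frequency is exposed the same way; a hypothetical helper:

// ARM64 only: counter frequency in Hz, for converting CNTVCT_EL0 deltas to seconds.
static inline uint64_t cntfrq_hz() {
  uint64_t f;
  __asm__ volatile("mrs %0, cntfrq_el0" : "=r"(f));
  return f;
}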