Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ OBJET = $(addprefix $(OBJDIR)/, \
)

CCAP = 86
CUDA = /usr/local/cuda-11.8
CUDA = /usr/local/cuda
CXX = g++
CXXCUDA = /usr/bin/g++

Expand All @@ -30,19 +30,19 @@ NOSTR_BLOCKS_PER_GRID = 512
NOSTR_THREADS_PER_BLOCK = 256
KEYS_PER_THREAD_BATCH = 64

CXXFLAGS = -DWITHGPU -m64 -mssse3 -Wno-write-strings -O2 -I$(SRCDIR) -I$(CUDA)/include \
CXXFLAGS = -DWITHGPU -march=native -Wno-write-strings -O2 -I$(SRCDIR) -I$(CUDA)/include \
-DNOSTR_BLOCKS_PER_GRID=$(NOSTR_BLOCKS_PER_GRID) \
-DNOSTR_THREADS_PER_BLOCK=$(NOSTR_THREADS_PER_BLOCK) \
-DKEYS_PER_THREAD_BATCH=$(KEYS_PER_THREAD_BATCH)
LFLAGS = /usr/lib/x86_64-linux-gnu/libgmp.so.10 -lpthread -L$(CUDA)/lib64 -lcudart -lcurand
LFLAGS = -lgmp -lpthread -L$(CUDA)/lib64 -lcudart -lcurand
NVCC = $(CUDA)/bin/nvcc

#--------------------------------------------------------------------

all: rummage

$(OBJDIR)/GPU/GPURummage.o: $(SRCDIR)/GPU/GPURummage.cu
$(NVCC) -allow-unsupported-compiler --compile --compiler-options -fPIC -ccbin $(CXXCUDA) -m64 -O2 -I$(SRCDIR) -I$(CUDA)/include \
$(NVCC) -allow-unsupported-compiler --compile --compiler-options -fPIC -ccbin $(CXXCUDA) -O2 -I$(SRCDIR) -I$(CUDA)/include \
-DNOSTR_BLOCKS_PER_GRID=$(NOSTR_BLOCKS_PER_GRID) \
-DNOSTR_THREADS_PER_BLOCK=$(NOSTR_THREADS_PER_BLOCK) \
-DKEYS_PER_THREAD_BATCH=$(KEYS_PER_THREAD_BATCH) \
Expand Down
2 changes: 2 additions & 0 deletions src/CPU/Int.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
#include "Int.h"
#include <string.h>
#include <math.h>
#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64)
#include <emmintrin.h>
#endif

#define MAX(x,y) (((x)>(y))?(x):(y))
#define MIN(x,y) (((x)<(y))?(x):(y))
Expand Down
53 changes: 53 additions & 0 deletions src/CPU/Int.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,57 @@ class Int {

#ifndef WIN64

#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)

// ===== ARM64 IMPLEMENTATIONS =====

// Full 64x64 -> 128-bit unsigned multiply.
// Stores the upper 64 bits of a*b in *h and returns the lower 64 bits.
static uint64_t inline _umul128(uint64_t a, uint64_t b, uint64_t *h) {
  const unsigned __int128 product = (unsigned __int128)a * (unsigned __int128)b;
  const uint64_t low = (uint64_t)product;
  const uint64_t high = (uint64_t)(product >> 64);
  *h = high;
  return low;
}

// Full 64x64 -> 128-bit signed multiply.
// Stores the upper 64 bits (sign included) of a*b in *h and returns the
// lower 64 bits reinterpreted as a signed value.
static int64_t inline _mul128(int64_t a, int64_t b, int64_t *h) {
  const __int128 product = (__int128)a * (__int128)b;
  const int64_t low = (int64_t)product;
  const int64_t high = (int64_t)(product >> 64);
  *h = high;
  return low;
}

// Divides the 128-bit value (hi:lo) by d.
// Stores the remainder in *r and returns the quotient truncated to 64 bits,
// so the result is only the true quotient when hi < d. d must be non-zero
// (division by zero is undefined).
static uint64_t inline _udiv128(uint64_t hi, uint64_t lo, uint64_t d, uint64_t *r) {
  unsigned __int128 numerator = (unsigned __int128)hi;
  numerator = (numerator << 64) | lo;
  const uint64_t remainder = (uint64_t)(numerator % d);
  const uint64_t quotient = (uint64_t)(numerator / d);
  *r = remainder;
  return quotient;
}

// ARM64 stand-in for the x86 __rdtsc() intrinsic: reads the CNTVCT_EL0
// system register with MRS and returns its 64-bit value.
// NOTE(review): CNTVCT_EL0 is the generic-timer virtual count, presumably
// not a CPU cycle count like RDTSC -- confirm callers only need an
// increasing tick value and do not assume a particular frequency.
static uint64_t inline __rdtsc() {
  uint64_t ticks;
  __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(ticks));
  return ticks;
}

// 128-bit funnel shifts mirroring the MSVC intrinsics of the same names.
// (a) is the low 64-bit half and (b) the high 64-bit half of the 128-bit
// quantity b:a.
//   __shiftright128 returns the low  64 bits of (b:a) >> n
//   __shiftleft128  returns the high 64 bits of (b:a) << n
// The shift count is taken modulo 64, so n == 0 is well defined (it yields
// a, respectively b) instead of shifting a 64-bit value by 64 bits.
// Each expansion is fully parenthesized and evaluates every argument exactly
// once, so it is safe inside larger expressions.
#define __shiftright128(a,b,n) \
  ((uint64_t)(((((unsigned __int128)(uint64_t)(b)) << 64) | (uint64_t)(a)) >> ((unsigned)(n) & 63u)))
#define __shiftleft128(a,b,n) \
  ((uint64_t)((((((unsigned __int128)(uint64_t)(b)) << 64) | (uint64_t)(a)) << ((unsigned)(n) & 63u)) >> 64))

// Portable replacement for the x86 _addcarry_u64 intrinsic.
// Computes a + b + carry-in, stores the low 64 bits in *out and returns the
// carry-out (0 or 1).
// As with the x86 intrinsic, any non-zero c_in counts as a carry of exactly
// one; the raw value of c_in is never added.
static inline unsigned char _addcarry_u64(unsigned char c_in, uint64_t a, uint64_t b, uint64_t *out) {
  const uint64_t carry_in = (c_in != 0) ? 1u : 0u;
  const uint64_t partial = a + b;            // may wrap: carry if partial < a
  const uint64_t total = partial + carry_in; // may wrap: carry if total < partial
  *out = total;
  // At most one of the two additions can wrap, so OR-ing the flags is exact.
  return (unsigned char)((partial < a) | (total < partial));
}

// Portable replacement for the x86 _subborrow_u64 intrinsic.
// Computes a - b - borrow-in, stores the low 64 bits in *out and returns the
// borrow-out (0 or 1).
// As with the x86 intrinsic, any non-zero b_in counts as a borrow of exactly
// one; the raw value of b_in is never subtracted.
static inline unsigned char _subborrow_u64(unsigned char b_in, uint64_t a, uint64_t b, uint64_t *out) {
  const uint64_t borrow_in = (b_in != 0) ? 1u : 0u;
  const uint64_t partial = a - b;            // wraps (borrows) when a < b
  *out = partial - borrow_in;                // wraps when partial < borrow_in
  // At most one of the two subtractions can wrap, so OR-ing the flags is exact.
  return (unsigned char)((a < b) | (partial < borrow_in));
}

// Alias the MSVC-style name _byteswap_uint64 to the compiler's 64-bit
// byte-swap builtin.
#define _byteswap_uint64 __builtin_bswap64
// LZC / TZC: number of leading / trailing zero bits of x, via the
// unsigned long long builtins. The builtins' result is undefined when x is 0.
#define LZC(x) __builtin_clzll(x)
#define TZC(x) __builtin_ctzll(x)

#else

// ===== X86-64 IMPLEMENTATIONS =====

// Missing intrinsics
static uint64_t inline _umul128(uint64_t a, uint64_t b, uint64_t *h) {
uint64_t rhi;
Expand Down Expand Up @@ -257,6 +308,8 @@ static uint64_t inline __rdtsc() {
#define LZC(x) __builtin_clzll(x)
#define TZC(x) __builtin_ctzll(x)

#endif // ARM64 vs x86-64

#else

#include <intrin.h>
Expand Down
18 changes: 17 additions & 1 deletion src/CPU/IntMod.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,15 @@
*/

#include "Int.h"
#include <emmintrin.h>
// On ARM64 there is no <emmintrin.h>; provide a small SSE2 shim on top of
// NEON instead. Only the names defined here are covered.
#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
#include <arm_neon.h>
// __m128i becomes a vector of two signed 64-bit lanes.
typedef int64x2_t __m128i;
// Per-lane 64-bit shift-left by an immediate, add and subtract.
#define _mm_slli_epi64(a, count) vshlq_n_s64(a, count)
#define _mm_add_epi64(a, b) vaddq_s64(a, b)
#define _mm_sub_epi64(a, b) vsubq_s64(a, b)
#else
// Every other target uses the real SSE2 intrinsics header.
#include <emmintrin.h>
#endif
#include <string.h>

#define MAX(x,y) (((x)>(y))?(x):(y))
Expand Down Expand Up @@ -234,6 +242,9 @@ void Int::DivStep62(Int* u,Int* v,int64_t* eta,int* pos,int64_t* uu,int64_t* uv,
_u.m128i_u64[1] = 0;
_v.m128i_u64[0] = 0;
_v.m128i_u64[1] = 1;
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
_u = vsetq_lane_s64(1, vdupq_n_s64(0), 0);
_v = vsetq_lane_s64(1, vdupq_n_s64(0), 1);
#else
((int64_t *)&_u)[0] = 1;
((int64_t *)&_u)[1] = 0;
Expand Down Expand Up @@ -271,6 +282,11 @@ void Int::DivStep62(Int* u,Int* v,int64_t* eta,int* pos,int64_t* uu,int64_t* uv,
*uv = _u.m128i_u64[1];
*vu = _v.m128i_u64[0];
*vv = _v.m128i_u64[1];
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
*uu = vgetq_lane_s64(_u, 0);
*uv = vgetq_lane_s64(_u, 1);
*vu = vgetq_lane_s64(_v, 0);
*vv = vgetq_lane_s64(_v, 1);
#else
*uu = ((int64_t *)&_u)[0];
*uv = ((int64_t *)&_u)[1];
Expand Down