From 2ff968d7725b0fddc99a1e202c51b35acb608eb3 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 13:17:01 -0300 Subject: [PATCH 01/25] Create yacoin.h include the constants and functions needed for YaCoin - pulled out of the original scrypt-jane.c in ali1234's repository --- yacoin.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 yacoin.h diff --git a/yacoin.h b/yacoin.h new file mode 100644 index 000000000..75d8c13e8 --- /dev/null +++ b/yacoin.h @@ -0,0 +1,15 @@ +/* +include the constants and functions needed for YaCoin + +pulled out of the original scrypt-jane.c in ali1234's repository + +*/ + +const unsigned char minNfactor = 4; +const unsigned char maxNfactor = 30; + +unsigned char GetNfactor(unsigned int nTimestamp) + +int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) From 38bb28858a59b47e4278a56d0b8e6033ba1f9316 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 13:18:53 -0300 Subject: [PATCH 02/25] Create yacoin.c --- yacoin.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 yacoin.c diff --git a/yacoin.c b/yacoin.c new file mode 100644 index 000000000..6b6edb12f --- /dev/null +++ b/yacoin.c @@ -0,0 +1,84 @@ +unsigned char GetNfactor(unsigned int nTimestamp) { + int l = 0; + + if (nTimestamp <= 1367991200) + return 4; + + unsigned long int s = nTimestamp - 1367991200; + while ((s >> 1) > 3) { + l += 1; + s >>= 1; + } + + s &= 3; + + int n = (l * 170 + s * 25 - 2320) / 100; + + if (n < 0) n = 0; + + if (n > 255) + printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); + + unsigned char N = (unsigned char)n; + //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfa$ + +// return min(max(N, minNfactor), maxNfactor); + + if(N<minNfactor) return minNfactor; + if(N>maxNfactor) return maxNfactor; + return N; +} + +int scanhash_scrypt_jane(int 
thr_id, uint32_t *pdata, + const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[20], hash[8], target_swap[8]; + volatile unsigned char *hashc = (unsigned char *) hash; + volatile unsigned char *datac = (unsigned char *) data; + volatile unsigned char *pdatac = (unsigned char *) pdata; + uint32_t n = pdata[19] - 1; + int i; + + /* byte swap it */ + for(int z=0;z<20;z++) { + datac[(z*4) ] = pdatac[(z*4)+3]; + datac[(z*4)+1] = pdatac[(z*4)+2]; + datac[(z*4)+2] = pdatac[(z*4)+1]; + datac[(z*4)+3] = pdatac[(z*4) ]; + } + + int nfactor = GetNfactor(data[17]); + + do { + data[19] = ++n; + + scrypt((unsigned char *)data, 80, + (unsigned char *)data, 80, + nfactor, 0, 0, (unsigned char *)hash, 32); + + if (hashc[31] == 0 && hashc[30] == 0) { +/* + for(int z=7;z>=0;z--) + fprintf(stderr, "%08x ", hash[z]); + fprintf(stderr, "\n"); + + for(int z=7;z>=0;z--) + fprintf(stderr, "%08x ", ptarget[z]); + fprintf(stderr, "\n"); +*/ + if(fulltest(hash, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdatac[76] = datac[79]; + pdatac[77] = datac[78]; + pdatac[78] = datac[77]; + pdatac[79] = datac[76]; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +} From 5fe4ec16198e8b742c842fe83e773a780e89be5f Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 13:24:28 -0300 Subject: [PATCH 03/25] Create scrypt-jane.c --- scrypt-jane/scrypt-jane.c | 182 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 scrypt-jane/scrypt-jane.c diff --git a/scrypt-jane/scrypt-jane.c b/scrypt-jane/scrypt-jane.c new file mode 100644 index 000000000..57c600ba7 --- /dev/null +++ b/scrypt-jane/scrypt-jane.c @@ -0,0 +1,182 @@ +/* + scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane + + Public Domain or MIT License, whichever is easier +*/ + +#include + +#include "scrypt-jane.h" +#include 
"code/scrypt-jane-portable.h" +#include "code/scrypt-jane-hash.h" +#include "code/scrypt-jane-romix.h" +#include "code/scrypt-jane-test-vectors.h" + + +#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */ +#if (SCRYPT_BLOCK_BYTES == 64) +#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 128) +#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 256) +#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 512) +#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */ +#endif +#define scrypt_maxr scrypt_r_32kb /* 32kb */ +#define scrypt_maxp 25 /* (1 << 25) = ~33 million */ + +#include +#include + +static void +scrypt_fatal_error_default(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(1); +} + +static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; + +void +scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) { + scrypt_fatal_error = fn; +} + +static int +scrypt_power_on_self_test() { + const scrypt_test_setting *t; + uint8_t test_digest[64]; + uint32_t i; + int res = 7, scrypt_valid; + + if (!scrypt_test_mix()) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: mix function power-on-self-test failed"); +#endif + res &= ~1; + } + + if (!scrypt_test_hash()) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: hash function power-on-self-test failed"); +#endif + res &= ~2; + } + + for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { + t = post_settings + i; + scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); + scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); + } + + if (!scrypt_valid) { +#if 
!defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); +#endif + res &= ~4; + } + + return res; +} + +typedef struct scrypt_aligned_alloc_t { + uint8_t *mem, *ptr; +} scrypt_aligned_alloc; + +#if defined(SCRYPT_TEST_SPEED) +static uint8_t *mem_base = (uint8_t *)0; +static size_t mem_bump = 0; + +/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ +static scrypt_aligned_alloc +scrypt_alloc(uint64_t size) { + scrypt_aligned_alloc aa; + if (!mem_base) { + mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); + if (!mem_base) + scrypt_fatal_error("scrypt: out of memory"); + mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + } + aa.mem = mem_base + mem_bump; + aa.ptr = aa.mem; + mem_bump += (size_t)size; + return aa; +} + +static void +scrypt_free(scrypt_aligned_alloc *aa) { + mem_bump = 0; +} +#else +static scrypt_aligned_alloc +scrypt_alloc(uint64_t size) { + static const size_t max_alloc = (size_t)-1; + scrypt_aligned_alloc aa; + size += (SCRYPT_BLOCK_BYTES - 1); + if (size > max_alloc) + scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); + aa.mem = (uint8_t *)malloc((size_t)size); + aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + if (!aa.mem) + scrypt_fatal_error("scrypt: out of memory"); + return aa; +} + +static void +scrypt_free(scrypt_aligned_alloc *aa) { + free(aa->mem); +} +#endif + + +void +scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) { + scrypt_aligned_alloc YX, V; + uint8_t *X, *Y; + uint32_t N, r, p, chunk_bytes, i; + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) + scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); +#endif + +#if !defined(SCRYPT_TEST) + static int 
power_on_self_test = 0; + if (!power_on_self_test) { + power_on_self_test = 1; + if (!scrypt_power_on_self_test()) + scrypt_fatal_error("scrypt: power on self test failed"); + } +#endif + + if (Nfactor > scrypt_maxN) + scrypt_fatal_error("scrypt: N out of range"); + if (rfactor > scrypt_maxr) + scrypt_fatal_error("scrypt: r out of range"); + if (pfactor > scrypt_maxp) + scrypt_fatal_error("scrypt: p out of range"); + + N = (1 << (Nfactor + 1)); + r = (1 << rfactor); + p = (1 << pfactor); + + chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; + V = scrypt_alloc((uint64_t)N * chunk_bytes); + YX = scrypt_alloc((p + 1) * chunk_bytes); + + /* 1: X = PBKDF2(password, salt) */ + Y = YX.ptr; + X = Y + chunk_bytes; + scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); + + /* 2: X = ROMix(X) */ + for (i = 0; i < p; i++) + scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); + + /* 3: Out = PBKDF2(password, X) */ + scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); + + scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); + + scrypt_free(&V); + scrypt_free(&YX); +} From 3532ca988891396cd62a39d7886329f46b8f8ddc Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 13:25:04 -0300 Subject: [PATCH 04/25] Create scrypt-jane.h --- scrypt-jane/scrypt-jane.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 scrypt-jane/scrypt-jane.h diff --git a/scrypt-jane/scrypt-jane.h b/scrypt-jane/scrypt-jane.h new file mode 100644 index 000000000..a682889cc --- /dev/null +++ b/scrypt-jane/scrypt-jane.h @@ -0,0 +1,27 @@ +#ifndef SCRYPT_JANE_H +#define SCRYPT_JANE_H + +/* + Nfactor: Increases CPU & Memory Hardness + N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used + + rfactor: Increases Memory Hardness + r = (1 << rfactor): How large a chunk is + + pfactor: Increases CPU Hardness + p = (1 << pfactor): Number 
of times to mix the main chunk + + A block is the basic mixing unit (salsa/chacha block = 64 bytes) + A chunk is (2 * r) blocks + + ~Memory used = (N + 2) * ((2 * r) * block size) +*/ + +#include + +typedef void (*scrypt_fatal_errorfn)(const char *msg); +void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); + +void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes); + +#endif /* SCRYPT_JANE_H */ From 81eb17082f8a2c316a6e6671ccfc3a22ec9fb12d Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 13:34:15 -0400 Subject: [PATCH 05/25] Merged ali1234 repository with floodberry's scrypt-jane repository still need to update cpu.miner.c to include the code that was previously in scrypt-jane.c that is specific to YaCoin --- scrypt-jane.c | 275 ----------- scrypt-jane.h | 27 -- scrypt-jane/README.md | 161 +++++++ scrypt-jane/code/scrypt-conf.h | 28 ++ .../code}/scrypt-jane-chacha.h | 8 +- {code => scrypt-jane/code}/scrypt-jane-hash.h | 0 scrypt-jane/code/scrypt-jane-hash_blake256.h | 177 +++++++ scrypt-jane/code/scrypt-jane-hash_blake512.h | 181 +++++++ .../code}/scrypt-jane-hash_keccak.h | 0 .../code}/scrypt-jane-hash_sha256.h | 0 scrypt-jane/code/scrypt-jane-hash_sha512.h | 152 ++++++ scrypt-jane/code/scrypt-jane-hash_skein512.h | 188 ++++++++ .../code}/scrypt-jane-mix_chacha-avx.h | 38 +- .../code}/scrypt-jane-mix_chacha-sse2.h | 2 +- .../code}/scrypt-jane-mix_chacha-ssse3.h | 38 +- .../code}/scrypt-jane-mix_chacha.h | 0 .../code}/scrypt-jane-mix_salsa-avx.h | 2 +- .../code}/scrypt-jane-mix_salsa-sse2.h | 4 +- .../code}/scrypt-jane-mix_salsa.h | 0 .../code/scrypt-jane-mix_salsa64-avx.h | 367 ++++++++++++++ .../code/scrypt-jane-mix_salsa64-sse2.h | 449 ++++++++++++++++++ .../code/scrypt-jane-mix_salsa64-ssse3.h | 399 ++++++++++++++++ scrypt-jane/code/scrypt-jane-mix_salsa64.h | 41 ++ 
.../code}/scrypt-jane-pbkdf2.h | 0 .../code}/scrypt-jane-portable-x86.h | 30 +- .../code}/scrypt-jane-portable.h | 0 .../code}/scrypt-jane-romix-basic.h | 8 +- .../code}/scrypt-jane-romix-template.h | 2 +- .../code}/scrypt-jane-romix.h | 0 .../code}/scrypt-jane-salsa.h | 3 + scrypt-jane/code/scrypt-jane-salsa64.h | 133 ++++++ .../code}/scrypt-jane-test-vectors.h | 0 scrypt-jane/example.c | 13 + scrypt-jane/scrypt-jane-speed.c | 121 +++++ scrypt-jane/scrypt-jane-test.c | 12 + scrypt-jane/scrypt-jane.c | 4 +- scrypt-jane/scrypt-jane.h | 2 +- scrypt-jane/test-speed.sh | 38 ++ scrypt-jane/test.sh | 44 ++ 39 files changed, 2611 insertions(+), 336 deletions(-) delete mode 100644 scrypt-jane.c delete mode 100644 scrypt-jane.h create mode 100644 scrypt-jane/README.md create mode 100644 scrypt-jane/code/scrypt-conf.h rename {code => scrypt-jane/code}/scrypt-jane-chacha.h (95%) rename {code => scrypt-jane/code}/scrypt-jane-hash.h (100%) create mode 100644 scrypt-jane/code/scrypt-jane-hash_blake256.h create mode 100644 scrypt-jane/code/scrypt-jane-hash_blake512.h rename {code => scrypt-jane/code}/scrypt-jane-hash_keccak.h (100%) rename {code => scrypt-jane/code}/scrypt-jane-hash_sha256.h (100%) create mode 100644 scrypt-jane/code/scrypt-jane-hash_sha512.h create mode 100644 scrypt-jane/code/scrypt-jane-hash_skein512.h rename {code => scrypt-jane/code}/scrypt-jane-mix_chacha-avx.h (91%) rename {code => scrypt-jane/code}/scrypt-jane-mix_chacha-sse2.h (99%) rename {code => scrypt-jane/code}/scrypt-jane-mix_chacha-ssse3.h (91%) rename {code => scrypt-jane/code}/scrypt-jane-mix_chacha.h (100%) rename {code => scrypt-jane/code}/scrypt-jane-mix_salsa-avx.h (99%) rename {code => scrypt-jane/code}/scrypt-jane-mix_salsa-sse2.h (99%) rename {code => scrypt-jane/code}/scrypt-jane-mix_salsa.h (100%) create mode 100644 scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h create mode 100644 scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h create mode 100644 
scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h create mode 100644 scrypt-jane/code/scrypt-jane-mix_salsa64.h rename {code => scrypt-jane/code}/scrypt-jane-pbkdf2.h (100%) rename {code => scrypt-jane/code}/scrypt-jane-portable-x86.h (89%) rename {code => scrypt-jane/code}/scrypt-jane-portable.h (100%) rename {code => scrypt-jane/code}/scrypt-jane-romix-basic.h (85%) rename {code => scrypt-jane/code}/scrypt-jane-romix-template.h (98%) rename {code => scrypt-jane/code}/scrypt-jane-romix.h (100%) rename {code => scrypt-jane/code}/scrypt-jane-salsa.h (96%) create mode 100644 scrypt-jane/code/scrypt-jane-salsa64.h rename {code => scrypt-jane/code}/scrypt-jane-test-vectors.h (100%) create mode 100644 scrypt-jane/example.c create mode 100644 scrypt-jane/scrypt-jane-speed.c create mode 100644 scrypt-jane/scrypt-jane-test.c create mode 100644 scrypt-jane/test-speed.sh create mode 100644 scrypt-jane/test.sh diff --git a/scrypt-jane.c b/scrypt-jane.c deleted file mode 100644 index 6db656489..000000000 --- a/scrypt-jane.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane - - Public Domain or MIT License, whichever is easier -*/ - -#include "cpuminer-config.h" -#include "miner.h" - -#include - -#include "scrypt-jane.h" -#include "code/scrypt-jane-portable.h" -#include "code/scrypt-jane-hash.h" -#include "code/scrypt-jane-romix.h" -#include "code/scrypt-jane-test-vectors.h" - - -#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */ -#if (SCRYPT_BLOCK_BYTES == 64) -#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 128) -#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 256) -#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 512) -#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 
blocks in a chunk * 512 bytes = Max of 32kb in a chunk */ -#endif -#define scrypt_maxr scrypt_r_32kb /* 32kb */ -#define scrypt_maxp 25 /* (1 << 25) = ~33 million */ - -#include -#include - -static void -scrypt_fatal_error_default(const char *msg) { - fprintf(stderr, "%s\n", msg); - exit(1); -} - -static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; - -void -scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) { - scrypt_fatal_error = fn; -} - -static int -scrypt_power_on_self_test() { - const scrypt_test_setting *t; - uint8_t test_digest[64]; - uint32_t i; - int res = 7, scrypt_valid; - - if (!scrypt_test_mix()) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: mix function power-on-self-test failed"); -#endif - res &= ~1; - } - - if (!scrypt_test_hash()) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: hash function power-on-self-test failed"); -#endif - res &= ~2; - } - - for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { - t = post_settings + i; - scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); - scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); - } - - if (!scrypt_valid) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); -#endif - res &= ~4; - } - - return res; -} - -typedef struct scrypt_aligned_alloc_t { - uint8_t *mem, *ptr; -} scrypt_aligned_alloc; - -#if defined(SCRYPT_TEST_SPEED) -static uint8_t *mem_base = (uint8_t *)0; -static size_t mem_bump = 0; - -/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ -static scrypt_aligned_alloc -scrypt_alloc(uint64_t size) { - scrypt_aligned_alloc aa; - if (!mem_base) { - mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); - if (!mem_base) - scrypt_fatal_error("scrypt: out of memory"); - mem_base = 
(uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - } - aa.mem = mem_base + mem_bump; - aa.ptr = aa.mem; - mem_bump += (size_t)size; - return aa; -} - -static void -scrypt_free(scrypt_aligned_alloc *aa) { - mem_bump = 0; -} -#else -static scrypt_aligned_alloc -scrypt_alloc(uint64_t size) { - static const size_t max_alloc = (size_t)-1; - scrypt_aligned_alloc aa; - size += (SCRYPT_BLOCK_BYTES - 1); - if (size > max_alloc) - scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); - aa.mem = (uint8_t *)malloc((size_t)size); - aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - if (!aa.mem) - scrypt_fatal_error("scrypt: out of memory"); - return aa; -} - -static void -scrypt_free(scrypt_aligned_alloc *aa) { - free(aa->mem); -} -#endif - - -void -scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) { - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; - uint32_t N, r, p, chunk_bytes, i; - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) - scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); -#endif - -#if !defined(SCRYPT_TEST) - static int power_on_self_test = 0; - if (!power_on_self_test) { - power_on_self_test = 1; - if (!scrypt_power_on_self_test()) - scrypt_fatal_error("scrypt: power on self test failed"); - } -#endif - - if (Nfactor > scrypt_maxN) - scrypt_fatal_error("scrypt: N out of range"); - if (rfactor > scrypt_maxr) - scrypt_fatal_error("scrypt: r out of range"); - if (pfactor > scrypt_maxp) - scrypt_fatal_error("scrypt: p out of range"); - - N = (1 << (Nfactor + 1)); - r = (1 << rfactor); - p = (1 << pfactor); - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - V = scrypt_alloc((uint64_t)N * chunk_bytes); - YX = scrypt_alloc((p + 1) * chunk_bytes); - - /* 1: X = PBKDF2(password, salt) */ - Y = YX.ptr; - X = Y + chunk_bytes; - 
scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); - - /* 2: X = ROMix(X) */ - for (i = 0; i < p; i++) - scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); - - /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); - - scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); - - scrypt_free(&V); - scrypt_free(&YX); -} - - -// yacoin: increasing Nfactor gradually -const unsigned char minNfactor = 4; -const unsigned char maxNfactor = 30; - -unsigned char GetNfactor(unsigned int nTimestamp) { - int l = 0; - - if (nTimestamp <= 1367991200) - return 4; - - unsigned long int s = nTimestamp - 1367991200; - while ((s >> 1) > 3) { - l += 1; - s >>= 1; - } - - s &= 3; - - int n = (l * 170 + s * 25 - 2320) / 100; - - if (n < 0) n = 0; - - if (n > 255) - printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); - - unsigned char N = (unsigned char)n; - //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfactor), maxNfactor)); - -// return min(max(N, minNfactor), maxNfactor); - - if(N<minNfactor) return minNfactor; - if(N>maxNfactor) return maxNfactor; - return N; -} - -int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[20], hash[8], target_swap[8]; - volatile unsigned char *hashc = (unsigned char *) hash; - volatile unsigned char *datac = (unsigned char *) data; - volatile unsigned char *pdatac = (unsigned char *) pdata; - uint32_t n = pdata[19] - 1; - int i; - - /* byte swap it */ - for(int z=0;z<20;z++) { - datac[(z*4) ] = pdatac[(z*4)+3]; - datac[(z*4)+1] = pdatac[(z*4)+2]; - datac[(z*4)+2] = pdatac[(z*4)+1]; - datac[(z*4)+3] = pdatac[(z*4) ]; - } - - int nfactor = GetNfactor(data[17]); - - do { - data[19] = ++n; - - scrypt((unsigned char *)data, 80, - (unsigned char *)data, 80, - nfactor, 0, 0, (unsigned char 
*)hash, 32); - - if (hashc[31] == 0 && hashc[30] == 0) { -/* - for(int z=7;z>=0;z--) - fprintf(stderr, "%08x ", hash[z]); - fprintf(stderr, "\n"); - - for(int z=7;z>=0;z--) - fprintf(stderr, "%08x ", ptarget[z]); - fprintf(stderr, "\n"); -*/ - if(fulltest(hash, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdatac[76] = datac[79]; - pdatac[77] = datac[78]; - pdatac[78] = datac[77]; - pdatac[79] = datac[76]; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} diff --git a/scrypt-jane.h b/scrypt-jane.h deleted file mode 100644 index 1c0df6242..000000000 --- a/scrypt-jane.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef SCRYPT_JANE_H -#define SCRYPT_JANE_H - -/* - Nfactor: Increases CPU & Memory Hardness - N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used - - rfactor: Increases Memory Hardness - r = (1 << rfactor): How large a chunk is - - pfactor: Increases CPU Hardness - p = (1 << pfactor): Number of times to mix the main chunk - - A block is the basic mixing unit (salsa/chacha block = 64 bytes) - A chunk is (2 * r) blocks - - ~Memory used = (N + 2) * ((2 * r) * block size) -*/ - -#include - -typedef void (*scrypt_fatal_errorfn)(const char *msg); -void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); - -void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes); - -#endif /* SCRYPT_JANE_H */ diff --git a/scrypt-jane/README.md b/scrypt-jane/README.md new file mode 100644 index 000000000..2b6976684 --- /dev/null +++ b/scrypt-jane/README.md @@ -0,0 +1,161 @@ +This project provides a performant, flexible implementations of Colin Percival's [scrypt](http://www.tarsnap.com/scrypt.html). 
+ +# Features + +## Modular Design + +The code uses a modular (compile, not runtime) layout to allow new mixing & hash functions to be added easily. The base components (HMAC, PBKDF2, and scrypt) are static and will immediately work with any conforming mix or hash function. + +## Supported Mix Functions + +* [Salsa20/8](http://cr.yp.to/salsa20.html) +* [ChaCha20/8](http://cr.yp.to/chacha.html) +* [Salsa6420/8]() + +I am not actually aware of any other candidates for a decent mix function. Salsa20/8 was nearly perfect, but its successor, ChaCha20/8, has better diffusion and is thus stronger, is potentially faster given advanced SIMD support (byte level shuffles, or a 32bit rotate), and is slightly cleaner to implement given that it requires no pre/post processing of data for SIMD implementations. + +64-byte blocks are no longer assumed! Salsa6420/8 is a 'proof of concept' 64-bit version of Salsa20/8 with a 128 byte block, and rotation constants chosen to allow 32-bit word shuffles instead of rotations for two of the rotations which put it on par with ChaCha in terms of SSE implementation shortcuts. + +## Supported Hash Functions + +* SHA256/512 +* [BLAKE256/512](https://www.131002.net/blake/) +* [Skein512](http://www.skein-hash.info/) +* [Keccak256/512](http://keccak.noekeon.org/) (SHA-3) + +Hash function implementations, unlike mix functions, are not optimized. The PBKDF2 computations are relatively minor in the scrypt algorithm, so including CPU specific versions, or vastly unrolling loops, would serve little purpose while bloating the code, both source and binary, and making it more confusing to implement correctly. + +Most (now only two!) of the SHA-3 candidates fall in to the "annoying to read/implement" category and have not been included yet. This will of course be moot once ~~BLAKE is chosen as SHA-3~~ Keccak is chosen as SHA-3. Well shit. 
+ +## CPU Adaptation + +The mixing function specialization is selected at runtime based on what the CPU supports (well, x86/x86-64 for now, but theoretically any). On platforms where this is not needed, e.g. where packages are usually compiled from source, it can also select the most suitable implementation at compile time, cutting down on binary size. + +For those who are familiar with the scrypt spec, the code specializes at the ROMix level, allowing all copy, and xor calls to be inlined efficiently. ***Update***: This is actually not as important as I switched from specializing at the mix() level and letting the compiler somewhat inefficiently inline block_copy and block_xor to specializing at ChunkMix(), where they can be inlined properly. I thought about specializing at ROMix(), but it would increase the complexity per mix function even more and would not present many more opportunities than what is generated by the compiler presently. + +MSVC uses SSE intrinsics as opposed to inline assembly for the mix functions to allow the compiler to fully inline properly. Also, Visual Studio is not smart enough to allow inline assembly in 64-bit code. + +## Self Testing + +On first use, scrypt() runs a small series of tests to make sure the hash function, mix functions, and scrypt() itself, are generating correct results. It will exit() (or call a user defined fatal error function) should any of these tests fail. + +Test vectors for individual mix and hash functions are generated from reference implementations. The only "official" test vectors for the full scrypt() are for SHA256 + Salsa20/8 of course; other combinations are generated from this code (once it works with all reference test vectors) and subject to change if any implementation errors are discovered. + +# Performance (on an E5200 2.5GHZ) + +Benchmarks are run _without_ allocating memory, i.e. allocating enough memory before the trials are run. 
Different allocators can have different costs and non-deterministic effects, which is not the point of comparing implementations. The only hash function compared will be SHA-256 to be comparable to Colin's reference implementation, and the hash function will generally be a fraction of a percent of noise in the overall result. + +Three different scrypt settings are tested (the last two are from the scrypt paper): + +* 'High Volume': N=4096, r=8, p=1, 4mb memory +* 'Interactive': N=16384, r=8, p=1, 16mb memory +* 'Non-Interactive': N=1048576, r=8, p=1, 1gb memory + +Cycle counts are in millions of cycles. All versions compiled with gcc 4.6.3, -O3. Sorted from fastest to slowest. + +Scaling refers to how much more expensive 'Non-Interactive' is to compute than 'High Volume', normalized to "ideal" scaling (256x difficulty). Under 100% means it becomes easier to process as N grows, over 100% means it becomes more difficult to process as N grows. + + + + + + + + + + + + + + + + + +
| Implementation | Algo | High Volume | Interactive | Non-Interactive | Scaling |
|---|---|---|---|---|---|
| scrypt-jane SSSE3 64bit | Salsa6420/8 | 18.2m | 75.6m | 5120.0m | 110.0% |
| scrypt-jane SSSE3 64bit | ChaCha20/8 | 19.6m | 79.6m | 5296.7m | 105.6% |
| scrypt-jane SSSE3 32bit | ChaCha20/8 | 19.8m | 80.3m | 5346.1m | 105.5% |
| scrypt-jane SSE2 64bit | Salsa6420/8 | 19.8m | 82.1m | 5529.2m | 109.1% |
| scrypt-jane SSE2 64bit | Salsa20/8 | 22.1m | 89.7m | 5938.8m | 105.0% |
| scrypt-jane SSE2 32bit | Salsa20/8 | 22.3m | 90.6m | 6011.0m | 105.3% |
| scrypt-jane SSE2 64bit | ChaCha20/8 | 23.9m | 96.8m | 6399.7m | 104.6% |
| scrypt-jane SSE2 32bit | ChaCha20/8 | 24.2m | 98.3m | 6500.7m | 104.9% |
| *Reference SSE2 64bit* | Salsa20/8 | 32.9m | 135.2m | 8881.6m | 105.5% |
| *Reference SSE2 32bit* | Salsa20/8 | 33.0m | 134.4m | 8885.2m | 105.2% |
+ +* scrypt-jane Salsa6420/8-SSSE3 is ~1.80x faster than reference Salsa20/8-SSE2 for High Volume, but drops to 1.73x faster for 'Non-Interactive' instead of remaining constant +* scrypt-jane ChaCha20/8-SSSE3 is ~1.67x faster than reference Salsa20/8-SSE2 +* scrypt-jane Salsa20/8-SSE2 is ~1.48x faster than reference Salsa20/8-SSE2 + +# Performance (on a slightly noisy E3-1270 3.4GHZ) + +All versions compiled with gcc 4.4.7, -O3. Sorted from fastest to slowest. + + + + + + + + + + + + + + + + + + + + + + +
| Implementation | Algo | High Volume | Interactive | Non-Interactive | Scaling |
|---|---|---|---|---|---|
| scrypt-jane AVX 64bit | Salsa6420/8 | 11.8m | 52.5m | 3848.6m | 127.4% |
| scrypt-jane SSSE3 64bit | Salsa6420/8 | 13.3m | 57.9m | 4176.6m | 122.7% |
| scrypt-jane SSE2 64bit | Salsa6420/8 | 14.2m | 61.1m | 4382.4m | 120.6% |
| scrypt-jane AVX 64bit | ChaCha20/8 | 18.0m | 77.4m | 5396.8m | 117.1% |
| scrypt-jane AVX 32bit | ChaCha20/8 | 18.3m | 82.1m | 5421.8m | 115.7% |
| scrypt-jane SSSE3 64bit | ChaCha20/8 | 19.0m | 81.3m | 5600.7m | 115.1% |
| scrypt-jane AVX 64bit | Salsa20/8 | 19.0m | 81.2m | 5610.6m | 115.3% |
| scrypt-jane AVX 32bit | Salsa20/8 | 19.0m | 81.3m | 5621.6m | 115.6% |
| scrypt-jane SSSE3 32bit | ChaCha20/8 | 19.1m | 81.8m | 5621.6m | 115.0% |
| scrypt-jane SSE2 64bit | Salsa20/8 | 19.5m | 83.8m | 5772.9m | 115.6% |
| scrypt-jane SSE2 32bit | Salsa20/8 | 19.6m | 84.0m | 5793.9m | 115.5% |
| *Reference SSE2/AVX 64bit* | Salsa20/8 | 21.5m | 90.4m | 6147.1m | 111.7% |
| *Reference SSE2/AVX 32bit* | Salsa20/8 | 22.3m | 94.0m | 6267.7m | 110.0% |
| scrypt-jane SSE2 64bit | ChaCha20/8 | 23.1m | 97.7m | 6670.0m | 112.8% |
| scrypt-jane SSE2 32bit | ChaCha20/8 | 23.3m | 98.4m | 6728.7m | 112.8% |
| *Reference SSE2 64bit* | Salsa20/8 | 30.4m | 125.6m | 8139.4m | 104.6% |
| *Reference SSE2 32bit* | Salsa20/8 | 30.0m | 124.5m | 8469.3m | 110.3% |
+ +* scrypt-jane Salsa6420/8-AVX is 1.60x - 1.82x faster than reference Salsa20/8-SSE2/AVX +* scrypt-jane ChaCha20/8-AVX is 1.13x - 1.19x faster than reference Salsa20/8-SSE2/AVX +* scrypt-jane Salsa20/8-AVX is 1.09x - 1.13x faster than reference Salsa20/8-SSE2/AVX + + +# Building + + [gcc,icc,clang] scrypt-jane.c -O3 -[m32,m64] -DSCRYPT_MIX -DSCRYPT_HASH -c + +where SCRYPT_MIX is one of + +* SCRYPT_SALSA +* SCRYPT_SALSA64 (no optimized 32-bit implementation) +* SCRYPT_CHACHA + +and SCRYPT_HASH is one of + +* SCRYPT_SHA256 +* SCRYPT_SHA512 +* SCRYPT_BLAKE256 +* SCRYPT_BLAKE512 +* SCRYPT_SKEIN512 +* SCRYPT_KECCAK256 +* SCRYPT_KECCAK512 + +e.g. + + gcc scrypt-jane.c -O3 -DSCRYPT_CHACHA -DSCRYPT_BLAKE512 -c + gcc example.c scrypt-jane.o -o example + +clang *may* need "-no-integrated-as" as some? versions don't support ".intel_syntax" + +# Using + + #include "scrypt-jane.h" + + scrypt(password, password_len, salt, salt_len, Nfactor, rfactor, pfactor, out, want_bytes); + +## scrypt parameters + +* Nfactor: Increases CPU & Memory Hardness +* rfactor: Increases Memory Hardness +* pfactor: Increases CPU Hardness + +In scrypt terms + +* N = (1 << (Nfactor + 1)), which controls how many times to mix each chunk, and how many temporary chunks are used. Increasing N increases both CPU time and memory used. +* r = (1 << rfactor), which controls how many blocks are in a chunk (i.e., 2 * r blocks are in a chunk). Increasing r increases how much memory is used. +* p = (1 << pfactor), which controls how many passes to perform over the set of N chunks. Increasing p increases CPU time used. + +I chose to use the log2 of each parameter as it is the common way to communicate settings (e.g. 2^20, not 1048576). 
+ +# License + +Public Domain, or MIT \ No newline at end of file diff --git a/scrypt-jane/code/scrypt-conf.h b/scrypt-jane/code/scrypt-conf.h new file mode 100644 index 000000000..46685a518 --- /dev/null +++ b/scrypt-jane/code/scrypt-conf.h @@ -0,0 +1,28 @@ +/* + pick the best algo at runtime or compile time? + ---------------------------------------------- + SCRYPT_CHOOSE_COMPILETIME (gcc only!) + SCRYPT_CHOOSE_RUNTIME +*/ +#define SCRYPT_CHOOSE_RUNTIME + + +/* + hash function to use + ------------------------------- + SCRYPT_BLAKE256 + SCRYPT_BLAKE512 + SCRYPT_SHA256 + SCRYPT_SHA512 + SCRYPT_SKEIN512 +*/ +//#define SCRYPT_SHA256 + + +/* + block mixer to use + ----------------------------- + SCRYPT_CHACHA + SCRYPT_SALSA +*/ +//#define SCRYPT_SALSA diff --git a/code/scrypt-jane-chacha.h b/scrypt-jane/code/scrypt-jane-chacha.h similarity index 95% rename from code/scrypt-jane-chacha.h rename to scrypt-jane/code/scrypt-jane-chacha.h index 41d96e5ee..7d1f11e5b 100644 --- a/code/scrypt-jane-chacha.h +++ b/scrypt-jane/code/scrypt-jane-chacha.h @@ -81,17 +81,21 @@ scrypt_getROMix() { #if defined(SCRYPT_TEST_SPEED) static size_t available_implementations() { + size_t cpuflags = detect_cpu(); size_t flags = 0; #if defined(SCRYPT_CHACHA_AVX) - flags |= cpu_avx; + if (cpuflags & cpu_avx) + flags |= cpu_avx; #endif #if defined(SCRYPT_CHACHA_SSSE3) - flags |= cpu_ssse3; + if (cpuflags & cpu_ssse3) + flags |= cpu_ssse3; #endif #if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) flags |= cpu_sse2; #endif diff --git a/code/scrypt-jane-hash.h b/scrypt-jane/code/scrypt-jane-hash.h similarity index 100% rename from code/scrypt-jane-hash.h rename to scrypt-jane/code/scrypt-jane-hash.h diff --git a/scrypt-jane/code/scrypt-jane-hash_blake256.h b/scrypt-jane/code/scrypt-jane-hash_blake256.h new file mode 100644 index 000000000..4690b1144 --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-hash_blake256.h @@ -0,0 +1,177 @@ +#define SCRYPT_HASH "BLAKE-256" +#define 
SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +const uint8_t blake256_sigma[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, + 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, + 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, + 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, + 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, + 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, + 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, + 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, +}; + +const uint32_t blake256_constants[16] = { + 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, + 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 +}; + +typedef struct scrypt_hash_state_t { + uint32_t H[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static void +blake256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16); + uint32_t m[16], v[16], h[8], t[2]; + uint32_t i; + + for (i = 0; i < 8; i++) h[i] = S->H[i]; + for (i = 0; i < 2; i++) t[i] = S->T[i]; + + while (blocks--) { + t[0] += 512; + t[1] += (t[0] < 512) ? 
1 : 0; + + for (i = 0; i < 8; i++) v[i ] = h[i]; + for (i = 0; i < 4; i++) v[i + 8] = blake256_constants[i]; + for (i = 0; i < 2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0]; + for (i = 0; i < 2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1]; + + for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]); + in += 64; + + #define G(a,b,c,d,e) \ + v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c],12); \ + v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); + + for (i = 0, sigma = blake256_sigma; i < 14; i++) { + G(0, 4, 8,12, 0); + G(1, 5, 9,13, 2); + G(2, 6,10,14, 4); + G(3, 7,11,15, 6); + + G(0, 5,10,15, 8); + G(1, 6,11,12,10); + G(2, 7, 8,13,12); + G(3, 4, 9,14,14); + + sigma += 16; + if (sigma == sigma_end) + sigma = blake256_sigma; + } + + #undef G + + for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); + } + + for (i = 0; i < 8; i++) S->H[i] = h[i]; + for (i = 0; i < 2; i++) S->T[i] = t[i]; +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667ULL; + S->H[1] = 0xbb67ae85ULL; + S->H[2] = 0x3c6ef372ULL; + S->H[3] = 0xa54ff53aULL; + S->H[4] = 0x510e527fULL; + S->H[5] = 0x9b05688cULL; + S->H[6] = 0x1f83d9abULL; + S->H[7] = 0x5be0cd19ULL; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + blake256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint32_t th, tl, bits; + + bits = (S->leftover << 3); + tl = S->T[0] + bits; + th = S->T[1]; + if (S->leftover == 0) { + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + } else if (S->T[0] == 0) { + S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits; + S->T[1] = S->T[1] - 1; + } else { + S->T[0] -= (512 - bits); + } + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + blake256_blocks(S, S->buffer, 1); + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + memset(S->buffer, 0, 56); + } + S->buffer[55] |= 1; + U32TO8_BE(S->buffer + 56, th); + U32TO8_BE(S->buffer + 60, tl); + blake256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20, + 0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8, +}; diff --git a/scrypt-jane/code/scrypt-jane-hash_blake512.h 
b/scrypt-jane/code/scrypt-jane-hash_blake512.h new file mode 100644 index 000000000..ea2a583de --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-hash_blake512.h @@ -0,0 +1,181 @@ +#define SCRYPT_HASH "BLAKE-512" +#define SCRYPT_HASH_BLOCK_SIZE 128 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +const uint8_t blake512_sigma[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, + 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, + 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, + 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, + 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, + 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, + 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, + 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, +}; + +const uint64_t blake512_constants[16] = { + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL +}; + +typedef struct scrypt_hash_state_t { + uint64_t H[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static void +blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + const uint8_t *sigma, *sigma_end = blake512_sigma + (10 * 16); + uint64_t m[16], v[16], h[8], t[2]; + uint32_t i; + + for (i = 0; i < 8; i++) h[i] = S->H[i]; + for (i = 0; i < 2; i++) t[i] = S->T[i]; + + while (blocks--) { + t[0] += 1024; + t[1] += (t[0] < 1024) ? 
1 : 0; + + for (i = 0; i < 8; i++) v[i ] = h[i]; + for (i = 0; i < 4; i++) v[i + 8] = blake512_constants[i]; + for (i = 0; i < 2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0]; + for (i = 0; i < 2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1]; + + for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]); + in += 128; + + #define G(a,b,c,d,e) \ + v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c],25); \ + v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c],11); + + for (i = 0, sigma = blake512_sigma; i < 16; i++) { + G(0, 4, 8,12, 0); + G(1, 5, 9,13, 2); + G(2, 6,10,14, 4); + G(3, 7,11,15, 6); + G(0, 5,10,15, 8); + G(1, 6,11,12,10); + G(2, 7, 8,13,12); + G(3, 4, 9,14,14); + + sigma += 16; + if (sigma == sigma_end) + sigma = blake512_sigma; + } + + #undef G + + for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); + } + + for (i = 0; i < 8; i++) S->H[i] = h[i]; + for (i = 0; i < 2; i++) S->T[i] = t[i]; +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667f3bcc908ULL; + S->H[1] = 0xbb67ae8584caa73bULL; + S->H[2] = 0x3c6ef372fe94f82bULL; + S->H[3] = 0xa54ff53a5f1d36f1ULL; + S->H[4] = 0x510e527fade682d1ULL; + S->H[5] = 0x9b05688c2b3e6c1fULL; + S->H[6] = 0x1f83d9abfb41bd6bULL; + S->H[7] = 0x5be0cd19137e2179ULL; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + blake512_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t th, tl; + size_t bits; + + bits = (S->leftover << 3); + tl = S->T[0] + bits; + th = S->T[1]; + if (S->leftover == 0) { + S->T[0] = (uint64_t)0 - (uint64_t)1024; + S->T[1] = (uint64_t)0 - (uint64_t)1; + } else if (S->T[0] == 0) { + S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits; + S->T[1] = S->T[1] - 1; + } else { + S->T[0] -= (1024 - bits); + } + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 111) { + memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); + blake512_blocks(S, S->buffer, 1); + S->T[0] = (uint64_t)0 - (uint64_t)1024; + S->T[1] = (uint64_t)0 - (uint64_t)1; + memset(S->buffer, 0, 112); + } + S->buffer[111] |= 1; + U64TO8_BE(S->buffer + 112, th); + U64TO8_BE(S->buffer + 120, tl); + blake512_blocks(S, S->buffer, 1); + + U64TO8_BE(&hash[ 0], S->H[0]); + U64TO8_BE(&hash[ 8], S->H[1]); + U64TO8_BE(&hash[16], S->H[2]); + U64TO8_BE(&hash[24], S->H[3]); + U64TO8_BE(&hash[32], S->H[4]); + U64TO8_BE(&hash[40], S->H[5]); + U64TO8_BE(&hash[48], S->H[6]); + U64TO8_BE(&hash[56], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f, + 0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83, + 
0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a, + 0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac +}; diff --git a/code/scrypt-jane-hash_keccak.h b/scrypt-jane/code/scrypt-jane-hash_keccak.h similarity index 100% rename from code/scrypt-jane-hash_keccak.h rename to scrypt-jane/code/scrypt-jane-hash_keccak.h diff --git a/code/scrypt-jane-hash_sha256.h b/scrypt-jane/code/scrypt-jane-hash_sha256.h similarity index 100% rename from code/scrypt-jane-hash_sha256.h rename to scrypt-jane/code/scrypt-jane-hash_sha256.h diff --git a/scrypt-jane/code/scrypt-jane-hash_sha512.h b/scrypt-jane/code/scrypt-jane-hash_sha512.h new file mode 100644 index 000000000..3e3997d00 --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-hash_sha512.h @@ -0,0 +1,152 @@ +#define SCRYPT_HASH "SHA-2-512" +#define SCRYPT_HASH_BLOCK_SIZE 128 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t H[8]; + uint64_t T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint64_t sha512_constants[80] = { + 0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull, + 0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull, + 0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull, + 0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull, + 0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull, + 0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull, + 0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull, + 0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull, + 0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 
0x4d2c6dfc5ac42aedull, 0x53380d139d95b3dfull, + 0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull, + 0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull, + 0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull, + 0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull, + 0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull, + 0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull, + 0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull, + 0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull, + 0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull, + 0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull, + 0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) +#define S1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)) +#define G0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7)) +#define G1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6)) +#define W0(in,i) (U8TO64_BE(&in[i * 8])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[i] + w[i]; \ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + +static void +sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + uint64_t r[8], w[80], t0, t1; + size_t i; + + for (i = 0; i < 8; i++) r[i] = S->H[i]; + + while (blocks--) 
{ + for (i = 0; i < 16; i++) { w[i] = W0(in, i); } + for (i = 16; i < 80; i++) { w[i] = W1(i); } + for (i = 0; i < 80; i++) { STEP(i); } + for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } + S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8; + S->T[1] += (!S->T[0]) ? 1 : 0; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667f3bcc908ull; + S->H[1] = 0xbb67ae8584caa73bull; + S->H[2] = 0x3c6ef372fe94f82bull; + S->H[3] = 0xa54ff53a5f1d36f1ull; + S->H[4] = 0x510e527fade682d1ull; + S->H[5] = 0x9b05688c2b3e6c1full; + S->H[6] = 0x1f83d9abfb41bd6bull; + S->H[7] = 0x5be0cd19137e2179ull; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha512_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1]; + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 111) { + memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); + sha512_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 112); + } + + U64TO8_BE(S->buffer + 112, t1); + U64TO8_BE(S->buffer + 120, t0); + sha512_blocks(S, S->buffer, 1); + + U64TO8_BE(&hash[ 
0], S->H[0]); + U64TO8_BE(&hash[ 8], S->H[1]); + U64TO8_BE(&hash[16], S->H[2]); + U64TO8_BE(&hash[24], S->H[3]); + U64TO8_BE(&hash[32], S->H[4]); + U64TO8_BE(&hash[40], S->H[5]); + U64TO8_BE(&hash[48], S->H[6]); + U64TO8_BE(&hash[56], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xba,0xc3,0x80,0x2b,0x24,0x56,0x95,0x1f,0x19,0x7c,0xa2,0xd3,0x72,0x7c,0x9a,0x4d, + 0x1d,0x50,0x3a,0xa9,0x12,0x27,0xd8,0xe1,0xbe,0x76,0x53,0x87,0x5a,0x1e,0x82,0xec, + 0xc8,0xe1,0x6b,0x87,0xd0,0xb5,0x25,0x7e,0xe8,0x1e,0xd7,0x58,0xc6,0x2d,0xc2,0x9c, + 0x06,0x31,0x8f,0x5b,0x57,0x8e,0x76,0xba,0xd5,0xf6,0xec,0xfe,0x85,0x1f,0x34,0x0c, +}; diff --git a/scrypt-jane/code/scrypt-jane-hash_skein512.h b/scrypt-jane/code/scrypt-jane-hash_skein512.h new file mode 100644 index 000000000..736d893de --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-hash_skein512.h @@ -0,0 +1,188 @@ +#define SCRYPT_HASH "Skein-512" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t X[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +#include + +static void +skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) { + uint64_t X[8], key[8], Xt[9+18], T[3+1]; + size_t r; + + while (blocks--) { + T[0] = S->T[0] + add; + T[1] = S->T[1]; + T[2] = T[0] ^ T[1]; + key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0]; + key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1]; + key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2]; + key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3]; + key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4]; + key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0]; + key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1]; + key[7] = 
U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7]; + Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7]; + in += SCRYPT_HASH_BLOCK_SIZE; + + for (r = 0; r < 18; r++) + Xt[r + 9] = Xt[r + 0]; + + for (r = 0; r < 18; r += 2) { + X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0]; + X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2]; + X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4]; + X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6]; + X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2]; + X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0]; + X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6]; + X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4]; + X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4]; + X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6]; + X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0]; + X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2]; + X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6]; + X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4]; + X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2]; + X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0]; + + X[0] += Xt[r + 1]; + X[1] += Xt[r + 2]; + X[2] += Xt[r + 3]; + X[3] += Xt[r + 4]; + X[4] += Xt[r + 5]; + X[5] += Xt[r + 6] + T[1]; + X[6] += Xt[r + 7] + T[2]; + X[7] += Xt[r + 8] + r + 1; + + T[3] = T[0]; + T[0] = T[1]; + T[1] = T[2]; + T[2] = T[3]; + + X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0]; + X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2]; + X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4]; + X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6]; + X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2]; + X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0]; + X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6]; + X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4]; + X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4]; + X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6]; + X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0]; + X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2]; + X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6]; + X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4]; + X[2] += X[5]; X[5] = 
ROTL64(X[5], 56) ^ X[2]; + X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0]; + + X[0] += Xt[r + 2]; + X[1] += Xt[r + 3]; + X[2] += Xt[r + 4]; + X[3] += Xt[r + 5]; + X[4] += Xt[r + 6]; + X[5] += Xt[r + 7] + T[1]; + X[6] += Xt[r + 8] + T[2]; + X[7] += Xt[r + 9] + r + 2; + + T[3] = T[0]; + T[0] = T[1]; + T[1] = T[2]; + T[2] = T[3]; + } + + S->X[0] = key[0] ^ X[0]; + S->X[1] = key[1] ^ X[1]; + S->X[2] = key[2] ^ X[2]; + S->X[3] = key[3] ^ X[3]; + S->X[4] = key[4] ^ X[4]; + S->X[5] = key[5] ^ X[5]; + S->X[6] = key[6] ^ X[6]; + S->X[7] = key[7] ^ X[7]; + + S->T[0] = T[0]; + S->T[1] = T[1] & ~0x4000000000000000ull; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->X[0] = 0x4903ADFF749C51CEull; + S->X[1] = 0x0D95DE399746DF03ull; + S->X[2] = 0x8FD1934127C79BCEull; + S->X[3] = 0x9A255629FF352CB1ull; + S->X[4] = 0x5DB62599DF6CA7B0ull; + S->X[5] = 0xEABE394CA9D5C3F4ull; + S->X[6] = 0x991112C71A75B523ull; + S->X[7] = 0xAE18A40B660FCC33ull; + S->T[0] = 0x0000000000000000ull; + S->T[1] = 0x7000000000000000ull; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */ + if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) { + /* handle the previous data, we know there is enough for at least one block */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + memcpy(S->buffer + S->leftover, in, want); + in += want; + inlen -= want; + S->leftover = 0; + skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE); + } + + /* handle the current data if there's more than one block */ + if (inlen > SCRYPT_HASH_BLOCK_SIZE) { + blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE); + inlen -= blocks; + in += blocks; + } + } + + /* handle leftover data */ + memcpy(S->buffer + S->leftover, 
in, inlen); + S->leftover += inlen; +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover); + S->T[1] |= 0x8000000000000000ull; + skein512_blocks(S, S->buffer, 1, S->leftover); + + memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE); + S->T[0] = 0; + S->T[1] = 0xff00000000000000ull; + skein512_blocks(S, S->buffer, 1, 8); + + U64TO8_LE(&hash[ 0], S->X[0]); + U64TO8_LE(&hash[ 8], S->X[1]); + U64TO8_LE(&hash[16], S->X[2]); + U64TO8_LE(&hash[24], S->X[3]); + U64TO8_LE(&hash[32], S->X[4]); + U64TO8_LE(&hash[40], S->X[5]); + U64TO8_LE(&hash[48], S->X[6]); + U64TO8_LE(&hash[56], S->X[7]); +} + + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4, + 0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf, + 0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41, + 0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67, +}; diff --git a/code/scrypt-jane-mix_chacha-avx.h b/scrypt-jane/code/scrypt-jane-mix_chacha-avx.h similarity index 91% rename from code/scrypt-jane-mix_chacha-avx.h rename to scrypt-jane/code/scrypt-jane-mix_chacha-avx.h index 50d6e2d2a..ab5ed2031 100644 --- a/code/scrypt-jane-mix_chacha-avx.h +++ b/scrypt-jane/code/scrypt-jane-mix_chacha-avx.h @@ -20,8 +20,28 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(shl edx,6) a2(lea ecx,[edx-64]) a2(and eax, eax) - a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) - a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + a2(mov ebx, 0x01000302) + a2(vmovd xmm4, ebx) + a2(mov ebx, 0x05040706) + a2(vmovd xmm0, ebx) + a2(mov ebx, 0x09080b0a) + a2(vmovd xmm1, ebx) + a2(mov ebx, 0x0d0c0f0e) + a2(vmovd xmm2, ebx) + a2(mov ebx, 0x02010003) + a2(vmovd xmm5, ebx) + a2(mov ebx, 0x06050407) + a2(vmovd xmm3, ebx) + a2(mov ebx, 0x0a09080b) + a2(vmovd xmm6, ebx) + a2(mov ebx, 0x0e0d0c0f) + a2(vmovd 
xmm7, ebx) + a3(vpunpckldq xmm4, xmm4, xmm0) + a3(vpunpckldq xmm5, xmm5, xmm3) + a3(vpunpckldq xmm1, xmm1, xmm2) + a3(vpunpckldq xmm6, xmm6, xmm7) + a3(vpunpcklqdq xmm4, xmm4, xmm1) + a3(vpunpcklqdq xmm5, xmm5, xmm6) a2(vmovdqa xmm0,[ecx+esi+0]) a2(vmovdqa xmm1,[ecx+esi+16]) a2(vmovdqa xmm2,[ecx+esi+32]) @@ -114,7 +134,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_avx) #endif @@ -134,12 +154,20 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(lea rax,[rsi+r9]) a2(lea r9,[rdx+r9]) a2(and rdx, rdx) - a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) - a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) a2(vmovdqa xmm0,[rax+0]) a2(vmovdqa xmm1,[rax+16]) a2(vmovdqa xmm2,[rax+32]) a2(vmovdqa xmm3,[rax+48]) + a2(mov r8, 0x0504070601000302) + a2(mov rax, 0x0d0c0f0e09080b0a) + a2(movq xmm4, r8) + a2(movq xmm6, rax) + a2(mov r8, 0x0605040702010003) + a2(mov rax, 0x0e0d0c0f0a09080b) + a2(movq xmm5, r8) + a2(movq xmm7, rax) + a3(vpunpcklqdq xmm4, xmm4, xmm6) + a3(vpunpcklqdq xmm5, xmm5, xmm7) a1(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[r9+0]) a3(vpxor xmm1,xmm1,[r9+16]) diff --git a/code/scrypt-jane-mix_chacha-sse2.h b/scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h similarity index 99% rename from code/scrypt-jane-mix_chacha-sse2.h rename to scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h index d2192c8f9..d040121e6 100644 --- a/code/scrypt-jane-mix_chacha-sse2.h +++ b/scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h @@ -128,7 +128,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_sse2) #endif diff --git a/code/scrypt-jane-mix_chacha-ssse3.h b/scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h similarity index 91% rename from code/scrypt-jane-mix_chacha-ssse3.h rename to scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h index b25e35672..b0609f185 100644 --- a/code/scrypt-jane-mix_chacha-ssse3.h +++ 
b/scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h @@ -20,8 +20,28 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(shl edx,6) a2(lea ecx,[edx-64]) a2(and eax, eax) - a2(movdqa xmm4,[ssse3_rotl16_32bit]) - a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(mov ebx, 0x01000302) + a2(movd xmm4, ebx) + a2(mov ebx, 0x05040706) + a2(movd xmm0, ebx) + a2(mov ebx, 0x09080b0a) + a2(movd xmm1, ebx) + a2(mov ebx, 0x0d0c0f0e) + a2(movd xmm2, ebx) + a2(mov ebx, 0x02010003) + a2(movd xmm5, ebx) + a2(mov ebx, 0x06050407) + a2(movd xmm3, ebx) + a2(mov ebx, 0x0a09080b) + a2(movd xmm6, ebx) + a2(mov ebx, 0x0e0d0c0f) + a2(movd xmm7, ebx) + a2(punpckldq xmm4, xmm0) + a2(punpckldq xmm5, xmm3) + a2(punpckldq xmm1, xmm2) + a2(punpckldq xmm6, xmm7) + a2(punpcklqdq xmm4, xmm1) + a2(punpcklqdq xmm5, xmm6) a2(movdqa xmm0,[ecx+esi+0]) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) @@ -118,7 +138,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_ssse3) #endif @@ -138,12 +158,20 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(lea rax,[rsi+r9]) a2(lea r9,[rdx+r9]) a2(and rdx, rdx) - a2(movdqa xmm4,[ssse3_rotl16_32bit]) - a2(movdqa xmm5,[ssse3_rotl8_32bit]) a2(movdqa xmm0,[rax+0]) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) + a2(mov r8, 0x0504070601000302) + a2(mov rax, 0x0d0c0f0e09080b0a) + a2(movq xmm4, r8) + a2(movq xmm6, rax) + a2(mov r8, 0x0605040702010003) + a2(mov rax, 0x0e0d0c0f0a09080b) + a2(movq xmm5, r8) + a2(movq xmm7, rax) + a2(punpcklqdq xmm4, xmm6) + a2(punpcklqdq xmm5, xmm7) a1(jz scrypt_ChunkMix_ssse3_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) diff --git a/code/scrypt-jane-mix_chacha.h b/scrypt-jane/code/scrypt-jane-mix_chacha.h similarity index 100% rename from code/scrypt-jane-mix_chacha.h rename to scrypt-jane/code/scrypt-jane-mix_chacha.h diff --git a/code/scrypt-jane-mix_salsa-avx.h b/scrypt-jane/code/scrypt-jane-mix_salsa-avx.h similarity index 99% rename from 
code/scrypt-jane-mix_salsa-avx.h rename to scrypt-jane/code/scrypt-jane-mix_salsa-avx.h index 15fb48e39..1ca90b5fa 100644 --- a/code/scrypt-jane-mix_salsa-avx.h +++ b/scrypt-jane/code/scrypt-jane-mix_salsa-avx.h @@ -120,7 +120,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_avx) #endif diff --git a/code/scrypt-jane-mix_salsa-sse2.h b/scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h similarity index 99% rename from code/scrypt-jane-mix_salsa-sse2.h rename to scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h index 4898659e6..ecc5f0f8d 100644 --- a/code/scrypt-jane-mix_salsa-sse2.h +++ b/scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h @@ -136,7 +136,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_sse2) #endif @@ -426,7 +426,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes] 4 9 14 3 */ - static void STDCALL + static void asm_calling_convention salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { uint32_t t; while (count--) { diff --git a/code/scrypt-jane-mix_salsa.h b/scrypt-jane/code/scrypt-jane-mix_salsa.h similarity index 100% rename from code/scrypt-jane-mix_salsa.h rename to scrypt-jane/code/scrypt-jane-mix_salsa.h diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h b/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h new file mode 100644 index 000000000..50c9902d5 --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h @@ -0,0 +1,367 @@ +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea 
rcx,[rcx*2]) + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a2(vmovdqa xmm4,[rax+64]) + a2(vmovdqa xmm5,[rax+80]) + a2(vmovdqa xmm6,[rax+96]) + a2(vmovdqa xmm7,[rax+112]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a3(vpxor xmm4,xmm4,[r9+64]) + a3(vpxor xmm5,xmm5,[r9+80]) + a3(vpxor xmm6,xmm6,[r9+96]) + a3(vpxor xmm7,xmm7,[r9+112]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a3(vpxor xmm4,xmm4,[rsi+r9+64]) + a3(vpxor xmm5,xmm5,[rsi+r9+80]) + a3(vpxor xmm6,xmm6,[rsi+r9+96]) + a3(vpxor xmm7,xmm7,[rsi+r9+112]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a3(vpxor xmm4,xmm4,[rdx+r9+64]) + a3(vpxor xmm5,xmm5,[rdx+r9+80]) + a3(vpxor xmm6,xmm6,[rdx+r9+96]) + a3(vpxor xmm7,xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [rsp+0],xmm0) + a2(vmovdqa [rsp+16],xmm1) + a2(vmovdqa [rsp+32],xmm2) + a2(vmovdqa [rsp+48],xmm3) + a2(vmovdqa [rsp+64],xmm4) + a2(vmovdqa [rsp+80],xmm5) + a2(vmovdqa [rsp+96],xmm6) + a2(vmovdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_avx_loop: ) + a3(vpaddq xmm8, xmm0, xmm2) + a3(vpaddq xmm9, xmm1, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm6, xmm6, xmm8) + a3(vpxor xmm7, xmm7, xmm9) + a3(vpaddq xmm10, xmm0, xmm6) + a3(vpaddq xmm11, xmm1, xmm7) + a3(vpsrlq xmm8, xmm10, 51) + a3(vpsrlq xmm9, xmm11, 51) + a3(vpsllq xmm10, xmm10, 13) + a3(vpsllq xmm11, xmm11, 13) + a3(vpxor xmm4, 
xmm4, xmm8) + a3(vpxor xmm5, xmm5, xmm9) + a3(vpxor xmm4, xmm4, xmm10) + a3(vpxor xmm5, xmm5, xmm11) + a3(vpaddq xmm8, xmm6, xmm4) + a3(vpaddq xmm9, xmm7, xmm5) + a3(vpsrlq xmm10, xmm8, 25) + a3(vpsrlq xmm11, xmm9, 25) + a3(vpsllq xmm8, xmm8, 39) + a3(vpsllq xmm9, xmm9, 39) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpaddq xmm10, xmm4, xmm2) + a3(vpaddq xmm11, xmm5, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm0, xmm0, xmm10) + a3(vpxor xmm1, xmm1, xmm11) + a2(vmovdqa xmm8, xmm2) + a2(vmovdqa xmm9, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm9, xmm8, 8) + a4(vpalignr xmm7, xmm8, xmm9, 8) + a2(sub rax, 2) + a3(vpaddq xmm10, xmm0, xmm2) + a3(vpaddq xmm11, xmm1, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm6, xmm6, xmm10) + a3(vpxor xmm7, xmm7, xmm11) + a3(vpaddq xmm8, xmm0, xmm6) + a3(vpaddq xmm9, xmm1, xmm7) + a3(vpsrlq xmm10, xmm8, 51) + a3(vpsrlq xmm11, xmm9, 51) + a3(vpsllq xmm8, xmm8, 13) + a3(vpsllq xmm9, xmm9, 13) + a3(vpxor xmm5, xmm5, xmm10) + a3(vpxor xmm4, xmm4, xmm11) + a3(vpxor xmm5, xmm5, xmm8) + a3(vpxor xmm4, xmm4, xmm9) + a3(vpaddq xmm10, xmm6, xmm5) + a3(vpaddq xmm11, xmm7, xmm4) + a3(vpsrlq xmm8, xmm10, 25) + a3(vpsrlq xmm9, xmm11, 25) + a3(vpsllq xmm10, xmm10, 39) + a3(vpsllq xmm11, xmm11, 39) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpaddq xmm8, xmm5, xmm2) + a3(vpaddq xmm9, xmm4, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm0, xmm0, xmm8) + a3(vpxor xmm1, xmm1, xmm9) + a2(vmovdqa xmm10, xmm2) + a2(vmovdqa xmm11, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm11, xmm10, 8) + a4(vpalignr xmm7, xmm10, xmm11, 8) + a1(ja scrypt_salsa64_avx_loop) + a3(vpaddq 
xmm0,xmm0,[rsp+0]) + a3(vpaddq xmm1,xmm1,[rsp+16]) + a3(vpaddq xmm2,xmm2,[rsp+32]) + a3(vpaddq xmm3,xmm3,[rsp+48]) + a3(vpaddq xmm4,xmm4,[rsp+64]) + a3(vpaddq xmm5,xmm5,[rsp+80]) + a3(vpaddq xmm6,xmm6,[rsp+96]) + a3(vpaddq xmm7,xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a2(vmovdqa [rax+64],xmm4) + a2(vmovdqa [rax+80],xmm5) + a2(vmovdqa [rax+96],xmm6) + a2(vmovdqa [rax+112],xmm7) + a1(jne scrypt_ChunkMix_avx_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX) + +#define SCRYPT_SALSA64_AVX + +static void asm_calling_convention +scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = 
_mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = 
_mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z2); + x4 = _mm_xor_si128(x4, z3); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_AVX) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-AVX" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h 
b/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h new file mode 100644 index 000000000..f8d957432 --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h @@ -0,0 +1,449 @@ +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[rcx*2]) + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a2(movdqa xmm4,[rax+64]) + a2(movdqa xmm5,[rax+80]) + a2(movdqa xmm6,[rax+96]) + a2(movdqa xmm7,[rax+112]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a2(pxor xmm4,[r9+64]) + a2(pxor xmm5,[r9+80]) + a2(pxor xmm6,[r9+96]) + a2(pxor xmm7,[r9+112]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a2(pxor xmm4,[rsi+r9+64]) + a2(pxor xmm5,[rsi+r9+80]) + a2(pxor xmm6,[rsi+r9+96]) + a2(pxor xmm7,[rsi+r9+112]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a2(pxor xmm4,[rdx+r9+64]) + a2(pxor xmm5,[rdx+r9+80]) + a2(pxor xmm6,[rdx+r9+96]) + a2(pxor xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [rsp+0],xmm0) + a2(movdqa [rsp+16],xmm1) + a2(movdqa [rsp+32],xmm2) + a2(movdqa [rsp+48],xmm3) + a2(movdqa [rsp+64],xmm4) + a2(movdqa [rsp+80],xmm5) + a2(movdqa [rsp+96],xmm6) 
+ a2(movdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_sse2_loop: ) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm4, xmm10) + a2(pxor xmm5, xmm11) + a2(pxor xmm4, xmm8) + a2(pxor xmm5, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm4) + a2(paddq xmm11, xmm5) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm4) + a2(movdqa xmm9, xmm5) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm8, xmm2) + a2(movdqa xmm9, xmm3) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(movdqa xmm2, xmm7) + a2(movdqa xmm3, xmm6) + a2(punpcklqdq xmm10, xmm6) + a2(punpcklqdq xmm11, xmm7) + a2(movdqa xmm6, xmm8) + a2(movdqa xmm7, xmm9) + a2(punpcklqdq xmm9, xmm9) + a2(punpcklqdq xmm8, xmm8) + a2(punpckhqdq xmm2, xmm10) + a2(punpckhqdq xmm3, xmm11) + a2(punpckhqdq xmm6, xmm9) + a2(punpckhqdq xmm7, xmm8) + a2(sub rax, 2) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + 
a2(pxor xmm5, xmm10) + a2(pxor xmm4, xmm11) + a2(pxor xmm5, xmm8) + a2(pxor xmm4, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm5) + a2(paddq xmm11, xmm4) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm5) + a2(movdqa xmm9, xmm4) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm8, xmm2) + a2(movdqa xmm9, xmm3) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(movdqa xmm2, xmm7) + a2(movdqa xmm3, xmm6) + a2(punpcklqdq xmm10, xmm6) + a2(punpcklqdq xmm11, xmm7) + a2(movdqa xmm6, xmm8) + a2(movdqa xmm7, xmm9) + a2(punpcklqdq xmm9, xmm9) + a2(punpcklqdq xmm8, xmm8) + a2(punpckhqdq xmm2, xmm10) + a2(punpckhqdq xmm3, xmm11) + a2(punpckhqdq xmm6, xmm9) + a2(punpckhqdq xmm7, xmm8) + a1(ja scrypt_salsa64_sse2_loop) + a2(paddq xmm0,[rsp+0]) + a2(paddq xmm1,[rsp+16]) + a2(paddq xmm2,[rsp+32]) + a2(paddq xmm3,[rsp+48]) + a2(paddq xmm4,[rsp+64]) + a2(paddq xmm5,[rsp+80]) + a2(paddq xmm6,[rsp+96]) + a2(paddq xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a2(movdqa [rax+64],xmm4) + a2(movdqa [rax+80],xmm5) + a2(movdqa [rax+96],xmm6) + a2(movdqa [rax+112],xmm7) + a1(jne scrypt_ChunkMix_sse2_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2) + +#define SCRYPT_SALSA64_SSE2 + +static void asm_calling_convention 
+scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, 
x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x4; + z1 = x5; + z2 = x2; + z3 = x3; + x4 = z1; + x5 = z0; + x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); + x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); + x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); + x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = 
_mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x4; + z1 = x5; + z2 = x2; + z3 = x3; + x4 = z1; + x5 = z0; + x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); + x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); + x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); + x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-SSE2" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif + +/* sse3/avx use this as well */ +#if defined(SCRYPT_SALSA64_INCLUDED) + /* + Default layout: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + + SSE2 layout: + 0 5 10 15 + 12 1 6 11 + 8 13 2 7 + 4 9 14 3 + */ + + + static void asm_calling_convention + salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) { + uint64_t t; + while (count--) { + t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; + t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; + t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; + t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; + t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; + t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; + blocks += 16; + } + } +#endif \ No newline at end of file diff --git 
a/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h b/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h new file mode 100644 index 000000000..105efa83f --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h @@ -0,0 +1,399 @@ +/* x64 */ +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[rcx*2]) + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a2(movdqa xmm4,[rax+64]) + a2(movdqa xmm5,[rax+80]) + a2(movdqa xmm6,[rax+96]) + a2(movdqa xmm7,[rax+112]) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a2(pxor xmm4,[r9+64]) + a2(pxor xmm5,[r9+80]) + a2(pxor xmm6,[r9+96]) + a2(pxor xmm7,[r9+112]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a2(pxor xmm4,[rsi+r9+64]) + a2(pxor xmm5,[rsi+r9+80]) + a2(pxor xmm6,[rsi+r9+96]) + a2(pxor xmm7,[rsi+r9+112]) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a2(pxor xmm4,[rdx+r9+64]) + a2(pxor xmm5,[rdx+r9+80]) + a2(pxor xmm6,[rdx+r9+96]) + a2(pxor xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa [rsp+0],xmm0) + a2(movdqa [rsp+16],xmm1) + a2(movdqa [rsp+32],xmm2) + a2(movdqa [rsp+48],xmm3) + a2(movdqa 
[rsp+64],xmm4) + a2(movdqa [rsp+80],xmm5) + a2(movdqa [rsp+96],xmm6) + a2(movdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_ssse3_loop: ) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm4, xmm10) + a2(pxor xmm5, xmm11) + a2(pxor xmm4, xmm8) + a2(pxor xmm5, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm4) + a2(paddq xmm11, xmm5) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm4) + a2(movdqa xmm9, xmm5) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm10, xmm2) + a2(movdqa xmm11, xmm3) + a2(movdqa xmm2, xmm6) + a2(movdqa xmm3, xmm7) + a3(palignr xmm2, xmm7, 8) + a3(palignr xmm3, xmm6, 8) + a2(movdqa xmm6, xmm11) + a2(movdqa xmm7, xmm10) + a3(palignr xmm6, xmm10, 8) + a3(palignr xmm7, xmm11, 8) + a2(sub rax, 2) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm5, xmm10) + a2(pxor xmm4, xmm11) + a2(pxor xmm5, xmm8) + a2(pxor xmm4, xmm9) + 
a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm5) + a2(paddq xmm11, xmm4) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm5) + a2(movdqa xmm9, xmm4) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm10, xmm2) + a2(movdqa xmm11, xmm3) + a2(movdqa xmm2, xmm6) + a2(movdqa xmm3, xmm7) + a3(palignr xmm2, xmm7, 8) + a3(palignr xmm3, xmm6, 8) + a2(movdqa xmm6, xmm11) + a2(movdqa xmm7, xmm10) + a3(palignr xmm6, xmm10, 8) + a3(palignr xmm7, xmm11, 8) + a1(ja scrypt_salsa64_ssse3_loop) + a2(paddq xmm0,[rsp+0]) + a2(paddq xmm1,[rsp+16]) + a2(paddq xmm2,[rsp+32]) + a2(paddq xmm3,[rsp+48]) + a2(paddq xmm4,[rsp+64]) + a2(paddq xmm5,[rsp+80]) + a2(paddq xmm6,[rsp+96]) + a2(paddq xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a2(movdqa [rax+64],xmm4) + a2(movdqa [rax+80],xmm5) + a2(movdqa [rax+96],xmm6) + a2(movdqa [rax+112],xmm7) + a1(jne scrypt_ChunkMix_ssse3_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3) + +#define SCRYPT_SALSA64_SSSE3 + +static void asm_calling_convention +scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t 
rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = 
_mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z2); + x4 = _mm_xor_si128(x4, z3); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, 
t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-SSSE3" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64.h b/scrypt-jane/code/scrypt-jane-mix_salsa64.h new file mode 100644 index 000000000..2aec04f33 --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-mix_salsa64.h @@ -0,0 +1,41 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "Salsa64/8 Ref" + +#undef SCRYPT_SALSA64_INCLUDED +#define SCRYPT_SALSA64_INCLUDED +#define SCRYPT_SALSA64_BASIC + +static void +salsa64_core_basic(uint64_t state[16]) { + const size_t rounds = 8; + uint64_t v[16], t; + size_t i; + + for (i = 0; i < 16; i++) v[i] = state[i]; + + #define G(a,b,c,d) \ + t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \ + t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \ + t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \ + t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \ + + for (i = 0; i < rounds; i += 2) { + G( 0, 4, 8,12); + G( 5, 9,13, 1); + G(10,14, 2, 6); + G(15, 3, 7,11); + G( 0, 1, 2, 3); + G( 5, 6, 7, 4); + G(10,11, 8, 9); + G(15,12,13,14); + } + + for (i = 0; i < 16; i++) state[i] += v[i]; + + #undef G +} + +#endif + diff --git a/code/scrypt-jane-pbkdf2.h b/scrypt-jane/code/scrypt-jane-pbkdf2.h similarity index 100% rename from code/scrypt-jane-pbkdf2.h rename to scrypt-jane/code/scrypt-jane-pbkdf2.h 
diff --git a/code/scrypt-jane-portable-x86.h b/scrypt-jane/code/scrypt-jane-portable-x86.h similarity index 89% rename from code/scrypt-jane-portable-x86.h rename to scrypt-jane/code/scrypt-jane-portable-x86.h index 03282fa8a..192f76497 100644 --- a/code/scrypt-jane-portable-x86.h +++ b/scrypt-jane/code/scrypt-jane-portable-x86.h @@ -115,11 +115,9 @@ } packedelem64; #endif -#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3) - const packedelem8 MM16 ssse3_rotr16_64bit = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}}; - const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; - const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; - const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}}; +#if defined(X86_INTRINSIC_SSSE3) + static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; + static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; #endif /* @@ -130,7 +128,8 @@ a1(..) a2(.., ..) a3(.., .., ..) 
- a1(ret) + 64bit OR 0 paramters: a1(ret) + 32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters asm_naked_fn_end(name) */ @@ -147,7 +146,8 @@ #define asm_align8 a1(ALIGN 8) #define asm_align16 a1(ALIGN 16) - #define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn + #define asm_calling_convention STDCALL + #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn #define asm_naked_fn(fn) { #define asm_naked_fn_end(fn) } #elif defined(COMPILER_GCC) @@ -156,6 +156,7 @@ #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" #define GNU_ASL(x) "\n" #x ":\n" + #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n" #define GNU_ASJ(x, y, z) #x " " #y #z ";" #define a1(x) GNU_AS1(x) @@ -167,9 +168,18 @@ #define asm_align8 a1(.align 8) #define asm_align16 a1(.align 16) - #define asm_naked_fn_proto(type, fn) extern type STDCALL fn - #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn) - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); + #if defined(OS_WINDOWS) + #define asm_calling_convention CDECL + #define aret(n) a1(ret) + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); + #else + #define asm_calling_convention STDCALL + #define aret(n) a1(ret n) + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); + #endif + #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) + #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" #define asm_gcc_parms() ".att_syntax prefix;" #define asm_gcc_trashed() __asm__ __volatile__("" ::: diff --git a/code/scrypt-jane-portable.h b/scrypt-jane/code/scrypt-jane-portable.h similarity index 100% rename from code/scrypt-jane-portable.h rename to 
scrypt-jane/code/scrypt-jane-portable.h diff --git a/code/scrypt-jane-romix-basic.h b/scrypt-jane/code/scrypt-jane-romix-basic.h similarity index 85% rename from code/scrypt-jane-romix-basic.h rename to scrypt-jane/code/scrypt-jane-romix-basic.h index ca1df02d5..1cdb3fb06 100644 --- a/code/scrypt-jane-romix-basic.h +++ b/scrypt-jane/code/scrypt-jane-romix-basic.h @@ -4,12 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc #endif /* romix pre/post nop function */ -static void STDCALL +static void asm_calling_convention scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { } /* romix pre/post endian conversion function */ -static void STDCALL +static void asm_calling_convention scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { #if !defined(CPU_LE) static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; @@ -24,8 +24,8 @@ scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { } /* chunkmix test function */ -typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); -typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); +typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); static int scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { diff --git a/code/scrypt-jane-romix-template.h b/scrypt-jane/code/scrypt-jane-romix-template.h similarity index 98% rename from code/scrypt-jane-romix-template.h rename to scrypt-jane/code/scrypt-jane-romix-template.h index 2fd7674ec..bfb47bb57 100644 --- a/code/scrypt-jane-romix-template.h +++ 
b/scrypt-jane/code/scrypt-jane-romix-template.h @@ -17,7 +17,7 @@ 2*r: number of blocks in the chunk */ -static void STDCALL +static void asm_calling_convention SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; uint32_t i, j, blocksPerChunk = r * 2, half = 0; diff --git a/code/scrypt-jane-romix.h b/scrypt-jane/code/scrypt-jane-romix.h similarity index 100% rename from code/scrypt-jane-romix.h rename to scrypt-jane/code/scrypt-jane-romix.h diff --git a/code/scrypt-jane-salsa.h b/scrypt-jane/code/scrypt-jane-salsa.h similarity index 96% rename from code/scrypt-jane-salsa.h rename to scrypt-jane/code/scrypt-jane-salsa.h index 0c1604bad..76f3da630 100644 --- a/code/scrypt-jane-salsa.h +++ b/scrypt-jane/code/scrypt-jane-salsa.h @@ -64,13 +64,16 @@ scrypt_getROMix() { #if defined(SCRYPT_TEST_SPEED) static size_t available_implementations() { + size_t cpuflags = detect_cpu(); size_t flags = 0; #if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) flags |= cpu_avx; #endif #if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) flags |= cpu_sse2; #endif diff --git a/scrypt-jane/code/scrypt-jane-salsa64.h b/scrypt-jane/code/scrypt-jane-salsa64.h new file mode 100644 index 000000000..ecc87f596 --- /dev/null +++ b/scrypt-jane/code/scrypt-jane-salsa64.h @@ -0,0 +1,133 @@ +#define SCRYPT_MIX_BASE "Salsa64/8" + +typedef uint64_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U64TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP + +#define SCRYPT_BLOCK_BYTES 128 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_salsa64-avx.h" +#include "scrypt-jane-mix_salsa64-ssse3.h" +#include "scrypt-jane-mix_salsa64-sse2.h" +#include "scrypt-jane-mix_salsa64.h" + +#if 
defined(SCRYPT_SALSA64_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 + #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN salsa64_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + return scrypt_ROMix_ssse3; + else +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t cpuflags = detect_cpu(); + size_t flags = 0; + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + flags |= cpu_ssse3; +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if 
(cpuflags & cpu_sse2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/code/scrypt-jane-test-vectors.h b/scrypt-jane/code/scrypt-jane-test-vectors.h similarity index 100% rename from code/scrypt-jane-test-vectors.h rename to scrypt-jane/code/scrypt-jane-test-vectors.h diff --git a/scrypt-jane/example.c b/scrypt-jane/example.c new file mode 100644 index 000000000..909e7970a --- /dev/null +++ b/scrypt-jane/example.c @@ -0,0 +1,13 @@ +#include +#include "scrypt-jane.h" + + +int main() { + unsigned char digest[16]; + int i; + scrypt("pw", 2, "salt", 4, 0, 0, 0, digest, 16); + for (i = 0; i < sizeof(digest); i++) + printf("%02x, ", digest[i]); + printf("\n"); + return 0; +} \ No newline at end of file diff --git a/scrypt-jane/scrypt-jane-speed.c b/scrypt-jane/scrypt-jane-speed.c new file mode 100644 index 000000000..c2cdd4bb4 --- /dev/null +++ b/scrypt-jane/scrypt-jane-speed.c @@ -0,0 +1,121 @@ +#define SCRYPT_TEST_SPEED +#include "scrypt-jane.c" + +/* ticks - not tested on anything other 
than x86 */ +static uint64_t +get_ticks(void) { +#if defined(CPU_X86) || defined(CPU_X86_64) + #if defined(COMPILER_INTEL) + return _rdtsc(); + #elif defined(COMPILER_MSVC) + return __rdtsc(); + #elif defined(COMPILER_GCC) + uint32_t lo, hi; + __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); + return ((uint64_t)lo | ((uint64_t)hi << 32)); + #else + need rdtsc for this compiler + #endif +#elif defined(OS_SOLARIS) + return (uint64_t)gethrtime(); +#elif defined(CPU_SPARC) && !defined(OS_OPENBSD) + uint64_t t; + __asm__ __volatile__("rd %%tick, %0" : "=r" (t)); + return t; +#elif defined(CPU_PPC) + uint32_t lo = 0, hi = 0; + __asm__ __volatile__("mftbu %0; mftb %1" : "=r" (hi), "=r" (lo)); + return ((uint64_t)lo | ((uint64_t)hi << 32)); +#elif defined(CPU_IA64) + uint64_t t; + __asm__ __volatile__("mov %0=ar.itc" : "=r" (t)); + return t; +#elif defined(OS_NIX) + timeval t2; + gettimeofday(&t2, NULL); + t = ((uint64_t)t2.tv_usec << 32) | (uint64_t)t2.tv_sec; + return t; +#else + need ticks for this platform +#endif +} + +#define timeit(x,minvar) { \ + ticks = get_ticks(); \ + x; \ + ticks = get_ticks() - ticks; \ + if (ticks < minvar) \ + minvar = ticks; \ + } + +#define maxticks 0xffffffffffffffffull + +typedef struct scrypt_speed_settings_t { + const char *desc; + uint8_t Nfactor, rfactor, pfactor; +} scrypt_speed_settings; + +/* scrypt_r_32kb is set to a 32kb chunk, so (1 << (scrypt_r_32kb - 5)) = 1kb chunk */ +static const scrypt_speed_settings settings[] = { + {"scrypt high volume ( ~4mb)", 11, scrypt_r_32kb - 5, 0}, + {"scrypt interactive (~16mb)", 13, scrypt_r_32kb - 5, 0}, + {"scrypt non-interactive (~ 1gb)", 19, scrypt_r_32kb - 5, 0}, + {0} +}; + +int main() { + const scrypt_speed_settings *s; + uint8_t password[64], salt[24], digest[64]; + uint64_t minticks, ticks; + size_t i, passes; + size_t cpuflags, topbit; + + for (i = 0; i < sizeof(password); i++) + password[i] = (uint8_t)i; + for (i = 0; i < sizeof(salt); i++) + salt[i] = 255 - (uint8_t)i; + + /* 
warm up a little */ + scrypt(password, sizeof(password), salt, sizeof(salt), 15, 3, 4, digest, sizeof(digest)); + + cpuflags = available_implementations(); + topbit = 0; + for (i = cpuflags; i != 0; i >>= 1) + topbit++; + topbit = ((size_t)1 << topbit); + + while (1) { + #if defined(SCRYPT_CHOOSE_COMPILETIME) + printf("speed test for scrypt[%s,%s]\n", SCRYPT_HASH, SCRYPT_MIX); + #else + printf("speed test for scrypt[%s,%s,%s]\n", SCRYPT_HASH, SCRYPT_MIX, get_top_cpuflag_desc(cpuflags)); + #endif + + cpu_detect_mask = cpuflags; + for (i = 0; settings[i].desc; i++) { + s = &settings[i]; + minticks = maxticks; + for (passes = 0; passes < 16; passes++) + timeit(scrypt(password, sizeof(password), salt, sizeof(salt), s->Nfactor, s->rfactor, s->pfactor, digest, sizeof(digest)), minticks) + + printf("%s, %.0f ticks\n", s->desc, (double)minticks); + } + + #if defined(SCRYPT_CHOOSE_COMPILETIME) + break; + #else + while (topbit && ((cpuflags & topbit) == 0)) + topbit >>= 1; + cpuflags &= ~topbit; + + /* (cpuflags == 0) is the basic/portable version, don't bother timing it */ + if (!cpuflags) + break; + #endif + } + + printf("\n\n"); + + return 0; +} + diff --git a/scrypt-jane/scrypt-jane-test.c b/scrypt-jane/scrypt-jane-test.c new file mode 100644 index 000000000..a8fbbc8a4 --- /dev/null +++ b/scrypt-jane/scrypt-jane-test.c @@ -0,0 +1,12 @@ +#define SCRYPT_TEST +#include "scrypt-jane.c" + +int main() { + int res = scrypt_power_on_self_test(); + + printf("%s: test %s\n", SCRYPT_MIX, (res & 1) ? "ok" : "FAILED"); + printf("%s: test %s\n", SCRYPT_HASH, (res & 2) ? "ok" : "FAILED"); + printf("scrypt: test vectors %s\n", (res & 4) ? "ok" : "FAILED"); + + return ((res & 7) == 7) ? 
0 : 1; +} diff --git a/scrypt-jane/scrypt-jane.c b/scrypt-jane/scrypt-jane.c index 57c600ba7..9e37dbada 100644 --- a/scrypt-jane/scrypt-jane.c +++ b/scrypt-jane/scrypt-jane.c @@ -1,5 +1,5 @@ /* - scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane + scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane Public Domain or MIT License, whichever is easier */ @@ -68,7 +68,7 @@ scrypt_power_on_self_test() { scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); } - + if (!scrypt_valid) { #if !defined(SCRYPT_TEST) scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); diff --git a/scrypt-jane/scrypt-jane.h b/scrypt-jane/scrypt-jane.h index a682889cc..1c0df6242 100644 --- a/scrypt-jane/scrypt-jane.h +++ b/scrypt-jane/scrypt-jane.h @@ -2,7 +2,7 @@ #define SCRYPT_JANE_H /* - Nfactor: Increases CPU & Memory Hardness + Nfactor: Increases CPU & Memory Hardness N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used rfactor: Increases Memory Hardness diff --git a/scrypt-jane/test-speed.sh b/scrypt-jane/test-speed.sh new file mode 100644 index 000000000..f223dae49 --- /dev/null +++ b/scrypt-jane/test-speed.sh @@ -0,0 +1,38 @@ +#!/bin/sh + +test() { + sleep 0.25 # mingw is stupid and will occasionally not have permission to overwrite scrypt_speed + gcc scrypt-jane-speed.c -O3 -DSCRYPT_$1 -DSCRYPT_$2 $3 -o scrypt_speed 2>/dev/null + local RC=$? 
+ if [ $RC -ne 0 ]; then + echo "$1/$2: failed to compile " + return + fi + ./scrypt_speed +} + +testhash() { + test $1 SALSA $2 + test $1 CHACHA $2 + test $1 SALSA64 $2 +} + +testhashes() { + testhash SHA256 $1 + testhash SHA512 $1 + testhash BLAKE256 $1 + testhash BLAKE512 $1 + testhash SKEIN512 $1 + testhash KECCAK256 $1 + testhash KECCAK512 $1 +} + +if [ -z $1 ]; then + testhashes +elif [ $1 -eq 32 ]; then + testhashes -m32 +elif [ $1 -eq 64 ]; then + testhashes -m64 +fi + +rm -f scrypt_speed \ No newline at end of file diff --git a/scrypt-jane/test.sh b/scrypt-jane/test.sh new file mode 100644 index 000000000..dc3d03251 --- /dev/null +++ b/scrypt-jane/test.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +test() { + sleep 0.25 # mingw is stupid and will occasionally not have permission to overwrite scrypt_test + gcc scrypt-jane-test.c -O3 -DSCRYPT_$1 -DSCRYPT_$2 $3 -o scrypt_test 2>/dev/null + local RC=$? + if [ $RC -ne 0 ]; then + echo "$1/$2: failed to compile " + return + fi + ./scrypt_test >/dev/null + local RC=$? 
+ if [ $RC -ne 0 ]; then + echo "$1/$2: validation failed" + return + fi + echo "$1/$2: OK" +} + +testhash() { + test $1 SALSA $2 + test $1 CHACHA $2 + test $1 SALSA64 $2 +} + +testhashes() { + testhash SHA256 $1 + testhash SHA512 $1 + testhash BLAKE256 $1 + testhash BLAKE512 $1 + testhash SKEIN512 $1 + testhash KECCAK256 $1 + testhash KECCAK512 $1 +} + +if [ -z $1 ]; then + testhashes +elif [ $1 -eq 32 ]; then + testhashes -m32 +elif [ $1 -eq 64 ]; then + testhashes -m64 +fi + +rm -f scrypt_test From 333fddf744538c3d956f9c65be99e5b4cfd0f985 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:39:21 -0300 Subject: [PATCH 06/25] Update yacoin.h --- yacoin.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yacoin.h b/yacoin.h index 75d8c13e8..f714397f3 100644 --- a/yacoin.h +++ b/yacoin.h @@ -1,3 +1,5 @@ +#ifndef __YACOIN_H__ +#define __YACOIN_H__ /* include the constants and functions needed for YaCoin From de6b6b74d4acc6b5bc6be0f4a4c6386d8fcdcd3c Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:40:11 -0300 Subject: [PATCH 07/25] Update yacoin.h --- yacoin.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/yacoin.h b/yacoin.h index f714397f3..91c9a4b55 100644 --- a/yacoin.h +++ b/yacoin.h @@ -7,8 +7,6 @@ pulled out of the original scrypt-jane.c in ali1234's repository */ -const unsigned char minNfactor = 4; -const unsigned char maxNfactor = 30; unsigned char GetNfactor(unsigned int nTimestamp) From 1afb046f4f3ad82bbf47bb9f1eda8fe9c47a09b2 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:40:30 -0300 Subject: [PATCH 08/25] Update yacoin.c --- yacoin.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/yacoin.c b/yacoin.c index 6b6edb12f..60fa56cb2 100644 --- a/yacoin.c +++ b/yacoin.c @@ -1,3 +1,7 @@ + +const unsigned char minNfactor = 4; +const unsigned char maxNfactor = 30; + unsigned char GetNfactor(unsigned int nTimestamp) { int l = 0; From e0f3548975260bf36c2223ba3ddcc96e32d7d948 Mon Sep 17 00:00:00 2001 
From: Thirtybird Date: Wed, 5 Jun 2013 13:42:54 -0400 Subject: [PATCH 09/25] Update Makefile.am --- Makefile.am | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index d8fce4b30..bc2215a51 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,3 @@ - if WANT_JANSSON JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson else @@ -7,7 +6,7 @@ endif EXTRA_DIST = example-cfg.json nomacro.pl -SUBDIRS = compat +SUBDIRS = compat scrypt-jane INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) From 654bcac6cd5528d881dc757e9bdff1c7db649ad9 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:43:24 -0300 Subject: [PATCH 10/25] Update cpu-miner.c --- cpu-miner.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpu-miner.c b/cpu-miner.c index 2b2fd9ab9..e95e77719 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -36,6 +36,7 @@ #include #include "compat.h" #include "miner.h" +#include "yacoin.h" #define PROGRAM_NAME "minerd" #define DEF_RPC_URL "http://127.0.0.1:9332/" From eef004cb6b49968e93688369d30858645407cf9a Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:49:31 -0300 Subject: [PATCH 11/25] Update Makefile.am --- Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index bc2215a51..89ce8787d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -6,7 +6,7 @@ endif EXTRA_DIST = example-cfg.json nomacro.pl -SUBDIRS = compat scrypt-jane +SUBDIRS = compat INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) @@ -15,7 +15,7 @@ bin_PROGRAMS = minerd minerd_SOURCES = elist.h miner.h compat.h \ cpu-miner.c util.c \ sha2.c sha2-arm.S sha2-x86.S sha2-x64.S \ - scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S scrypt-jane.c + scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S scrypt-jane/scrypt-jane.c minerd_LDFLAGS = $(PTHREAD_FLAGS) minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@ 
-DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME From 5728a0e4f1822e0faf7a6c5a53f3a0902157b433 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:52:02 -0300 Subject: [PATCH 12/25] Update yacoin.h --- yacoin.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yacoin.h b/yacoin.h index 91c9a4b55..340b4ff77 100644 --- a/yacoin.h +++ b/yacoin.h @@ -13,3 +13,5 @@ unsigned char GetNfactor(unsigned int nTimestamp) int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) + +#endif /* __YACOIN_H__ */ From 5467b552683f77968b4d2f5573873c5731aab1bd Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:55:40 -0300 Subject: [PATCH 13/25] Update yacoin.h --- yacoin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yacoin.h b/yacoin.h index 340b4ff77..928e95e44 100644 --- a/yacoin.h +++ b/yacoin.h @@ -8,10 +8,10 @@ pulled out of the original scrypt-jane.c in ali1234's repository */ -unsigned char GetNfactor(unsigned int nTimestamp) +unsigned char GetNfactor(unsigned int nTimestamp); int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) + uint32_t max_nonce, unsigned long *hashes_done); #endif /* __YACOIN_H__ */ From baf01f116db5baf9dc86f354191339baca89ac55 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:56:55 -0300 Subject: [PATCH 14/25] Update Makefile.am --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 89ce8787d..81d3f32ea 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,7 +15,7 @@ bin_PROGRAMS = minerd minerd_SOURCES = elist.h miner.h compat.h \ cpu-miner.c util.c \ sha2.c sha2-arm.S sha2-x86.S sha2-x64.S \ - scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S scrypt-jane/scrypt-jane.c + scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S scrypt-jane/scrypt-jane.c yacoin.c minerd_LDFLAGS = 
$(PTHREAD_FLAGS) minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@ -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME From aa6abd7c49aad7bf1b01912459617765777cfbb3 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:58:19 -0300 Subject: [PATCH 15/25] Update yacoin.c --- yacoin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/yacoin.c b/yacoin.c index 60fa56cb2..59d40b981 100644 --- a/yacoin.c +++ b/yacoin.c @@ -1,3 +1,4 @@ +#include const unsigned char minNfactor = 4; const unsigned char maxNfactor = 30; From 46eaeae3655fb9cd6f07cc6a2ab6d82bee22c659 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 15:00:54 -0300 Subject: [PATCH 16/25] Update cpu-miner.c --- cpu-miner.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/cpu-miner.c b/cpu-miner.c index e95e77719..a46606d31 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -134,6 +134,11 @@ static unsigned long accepted_count = 0L; static unsigned long rejected_count = 0L; double *thr_hashrates; +// Constants for YACoin's NFactor +const unsigned char minNfactor = 4; +const unsigned char maxNfactor = 30; + + #ifdef HAVE_GETOPT_LONG #include #else @@ -1152,3 +1157,88 @@ int main(int argc, char *argv[]) return 0; } + +unsigned char GetNfactor(unsigned int nTimestamp) { + int l = 0; + + if (nTimestamp <= 1367991200) + return 4; + + unsigned long int s = nTimestamp - 1367991200; + while ((s >> 1) > 3) { + l += 1; + s >>= 1; + } + + s &= 3; + + int n = (l * 170 + s * 25 - 2320) / 100; + + if (n < 0) n = 0; + + if (n > 255) + printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); + + unsigned char N = (unsigned char)n; + //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfa$ + +// return min(max(N, minNfactor), maxNfactor); + + if(N < minNfactor) return minNfactor; + if(N > maxNfactor) return maxNfactor; + return N; +} + +int scanhash_scrypt_jane(int
thr_id, uint32_t *pdata, + const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[20], hash[8], target_swap[8]; + volatile unsigned char *hashc = (unsigned char *) hash; + volatile unsigned char *datac = (unsigned char *) data; + volatile unsigned char *pdatac = (unsigned char *) pdata; + uint32_t n = pdata[19] - 1; + int i; + + /* byte swap it */ + for(int z=0;z<20;z++) { + datac[(z*4) ] = pdatac[(z*4)+3]; + datac[(z*4)+1] = pdatac[(z*4)+2]; + datac[(z*4)+2] = pdatac[(z*4)+1]; + datac[(z*4)+3] = pdatac[(z*4) ]; + } + + int nfactor = GetNfactor(data[17]); + + do { + data[19] = ++n; + + scrypt((unsigned char *)data, 80, + (unsigned char *)data, 80, + nfactor, 0, 0, (unsigned char *)hash, 32); + + if (hashc[31] == 0 && hashc[30] == 0) { +/* + for(int z=7;z>=0;z--) + fprintf(stderr, "%08x ", hash[z]); + fprintf(stderr, "\n"); + + for(int z=7;z>=0;z--) + fprintf(stderr, "%08x ", ptarget[z]); + fprintf(stderr, "\n"); +*/ + if(fulltest(hash, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdatac[76] = datac[79]; + pdatac[77] = datac[78]; + pdatac[78] = datac[77]; + pdatac[79] = datac[76]; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +} From bee50587cc44b62f06fad752006be5399effaf0a Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 15:01:22 -0300 Subject: [PATCH 17/25] Update Makefile.am --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 81d3f32ea..89ce8787d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,7 +15,7 @@ bin_PROGRAMS = minerd minerd_SOURCES = elist.h miner.h compat.h \ cpu-miner.c util.c \ sha2.c sha2-arm.S sha2-x86.S sha2-x64.S \ - scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S scrypt-jane/scrypt-jane.c yacoin.c + scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S scrypt-jane/scrypt-jane.c minerd_LDFLAGS = $(PTHREAD_FLAGS) minerd_LDADD = 
@LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@ -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME From 84c4ce8376cea157301de60b782145e07b3e7549 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 15:06:31 -0300 Subject: [PATCH 18/25] Update cpu-miner.c --- cpu-miner.c | 1 - 1 file changed, 1 deletion(-) diff --git a/cpu-miner.c b/cpu-miner.c index a46606d31..77dd483a5 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -36,7 +36,6 @@ #include #include "compat.h" #include "miner.h" -#include "yacoin.h" #define PROGRAM_NAME "minerd" #define DEF_RPC_URL "http://127.0.0.1:9332/" From a05d1ba57890a880cb7223e42663565aa17d857b Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 14:08:50 -0400 Subject: [PATCH 19/25] cleaning up unneeded files --- yacoin.c | 89 -------------------------------------------------------- yacoin.h | 17 ----------- 2 files changed, 106 deletions(-) delete mode 100644 yacoin.c delete mode 100644 yacoin.h diff --git a/yacoin.c b/yacoin.c deleted file mode 100644 index 59d40b981..000000000 --- a/yacoin.c +++ /dev/null @@ -1,89 +0,0 @@ -#include - -const unsigned char minNfactor = 4; -const unsigned char maxNfactor = 30; - -unsigned char GetNfactor(unsigned int nTimestamp) { - int l = 0; - - if (nTimestamp <= 1367991200) - return 4; - - unsigned long int s = nTimestamp - 1367991200; - while ((s >> 1) > 3) { - l += 1; - s >>= 1; - } - - s &= 3; - - int n = (l * 170 + s * 25 - 2320) / 100; - - if (n < 0) n = 0; - - if (n > 255) - printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); - - unsigned char N = (unsigned char)n; - //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfa$ - -// return min(max(N, minNfactor), maxNfactor); - - if(NmaxNfactor) return maxNfactor; - return N; -} - -int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - 
uint32_t data[20], hash[8], target_swap[8]; - volatile unsigned char *hashc = (unsigned char *) hash; - volatile unsigned char *datac = (unsigned char *) data; - volatile unsigned char *pdatac = (unsigned char *) pdata; - uint32_t n = pdata[19] - 1; - int i; - - /* byte swap it */ - for(int z=0;z<20;z++) { - datac[(z*4) ] = pdatac[(z*4)+3]; - datac[(z*4)+1] = pdatac[(z*4)+2]; - datac[(z*4)+2] = pdatac[(z*4)+1]; - datac[(z*4)+3] = pdatac[(z*4) ]; - } - - int nfactor = GetNfactor(data[17]); - - do { - data[19] = ++n; - - scrypt((unsigned char *)data, 80, - (unsigned char *)data, 80, - nfactor, 0, 0, (unsigned char *)hash, 32); - - if (hashc[31] == 0 && hashc[30] == 0) { -/* - for(int z=7;z>=0;z--) - fprintf(stderr, "%08x ", hash[z]); - fprintf(stderr, "\n"); - - for(int z=7;z>=0;z--) - fprintf(stderr, "%08x ", ptarget[z]); - fprintf(stderr, "\n"); -*/ - if(fulltest(hash, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdatac[76] = datac[79]; - pdatac[77] = datac[78]; - pdatac[78] = datac[77]; - pdatac[79] = datac[76]; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} diff --git a/yacoin.h b/yacoin.h deleted file mode 100644 index 928e95e44..000000000 --- a/yacoin.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __YACOIN_H__ -#define __YACOIN_H__ -/* -include the constants and functions needed for YaCoin - -pulled out of the original scrypt-jane.c in ali1234's repository - -*/ - - -unsigned char GetNfactor(unsigned int nTimestamp); - -int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done); - -#endif /* __YACOIN_H__ */ From 6c27877a5c7c757796c1d2c0597278f1b7836970 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 16:10:43 -0400 Subject: [PATCH 20/25] moved YACoin routines into yacoin.c --- yacoin.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ yacoin.h | 15 +++++++++ 2 
files changed, 111 insertions(+) create mode 100644 yacoin.c create mode 100644 yacoin.h diff --git a/yacoin.c b/yacoin.c new file mode 100644 index 000000000..9a8685506 --- /dev/null +++ b/yacoin.c @@ -0,0 +1,96 @@ +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include +#include + +// Constants for YACoin's NFactor +const unsigned char minNfactor = 4; +const unsigned char maxNfactor = 30; + +unsigned char GetNfactor(unsigned int nTimestamp) { + int l = 0; + + if (nTimestamp <= 1367991200) + return 4; + + unsigned long int s = nTimestamp - 1367991200; + while ((s >> 1) > 3) { + l += 1; + s >>= 1; + } + + s &= 3; + + int n = (l * 170 + s * 25 - 2320) / 100; + + if (n < 0) n = 0; + + if (n > 255) + printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); + + unsigned char N = (unsigned char)n; + //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfa$ + +// return min(max(N, minNfactor), maxNfactor); + + if(NmaxNfactor) return maxNfactor; + return N; +} + +int scanhash_yacoin(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[20], hash[8], target_swap[8]; + volatile unsigned char *hashc = (unsigned char *) hash; + volatile unsigned char *datac = (unsigned char *) data; + volatile unsigned char *pdatac = (unsigned char *) pdata; + uint32_t n = pdata[19] - 1; + int i; + + /* byte swap it */ + for(int z=0;z<20;z++) { + datac[(z*4) ] = pdatac[(z*4)+3]; + datac[(z*4)+1] = pdatac[(z*4)+2]; + datac[(z*4)+2] = pdatac[(z*4)+1]; + datac[(z*4)+3] = pdatac[(z*4) ]; + } + + int nfactor = GetNfactor(data[17]); + + do { + data[19] = ++n; + + scrypt((unsigned char *)data, 80, + (unsigned char *)data, 80, + nfactor, 0, 0, (unsigned char *)hash, 32); + + if (hashc[31] == 0 && hashc[30] == 0) { +/* + for(int z=7;z>=0;z--) + fprintf(stderr, "%08x ", hash[z]); + fprintf(stderr, "\n"); + + for(int z=7;z>=0;z--) + fprintf(stderr, "%08x ", 
ptarget[z]); + fprintf(stderr, "\n"); +*/ + if(fulltest(hash, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdatac[76] = datac[79]; + pdatac[77] = datac[78]; + pdatac[78] = datac[77]; + pdatac[79] = datac[76]; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +} + diff --git a/yacoin.h b/yacoin.h new file mode 100644 index 000000000..cf6949ada --- /dev/null +++ b/yacoin.h @@ -0,0 +1,15 @@ +#ifndef __YACOIN_H__ +#define __YACOIN_H__ + +#include +#include +#include + +unsigned char GetNfactor(unsigned int nTimestamp); + +int scanhash_yacoin(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); + +#endif /* __YACOIN_H__ */ + From 84dd6ce231a1e1fc05b8ca6d695c4a4c8fcb7f88 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 16:14:54 -0400 Subject: [PATCH 21/25] Updated to use YACoin routines in yacoin.c --- Makefile.am | 2 +- cpu-miner.c | 91 ++--------------------------------------------------- 2 files changed, 3 insertions(+), 90 deletions(-) diff --git a/Makefile.am b/Makefile.am index 89ce8787d..cca2fc9e5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,7 +15,7 @@ bin_PROGRAMS = minerd minerd_SOURCES = elist.h miner.h compat.h \ cpu-miner.c util.c \ sha2.c sha2-arm.S sha2-x86.S sha2-x64.S \ - scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S scrypt-jane/scrypt-jane.c + scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S yacoin.c scrypt-jane/scrypt-jane.c minerd_LDFLAGS = $(PTHREAD_FLAGS) minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@ -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME diff --git a/cpu-miner.c b/cpu-miner.c index 77dd483a5..7846477c6 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -36,6 +36,7 @@ #include #include "compat.h" #include "miner.h" +#include "yacoin.h" #define PROGRAM_NAME "minerd" #define DEF_RPC_URL 
"http://127.0.0.1:9332/" @@ -133,10 +134,6 @@ static unsigned long accepted_count = 0L; static unsigned long rejected_count = 0L; double *thr_hashrates; -// Constants for YACoin's NFactor -const unsigned char minNfactor = 4; -const unsigned char maxNfactor = 30; - #ifdef HAVE_GETOPT_LONG #include @@ -638,7 +635,7 @@ static void *miner_thread(void *userdata) break; case ALGO_SCRYPT_JANE: - rc = scanhash_scrypt_jane(thr_id, work.data, work.target, + rc = scanhash_yacoin(thr_id, work.data, work.target, max_nonce, &hashes_done); break; @@ -1157,87 +1154,3 @@ int main(int argc, char *argv[]) return 0; } -unsigned char GetNfactor(unsigned int nTimestamp) { - int l = 0; - - if (nTimestamp <= 1367991200) - return 4; - - unsigned long int s = nTimestamp - 1367991200; - while ((s >> 1) > 3) { - l += 1; - s >>= 1; - } - - s &= 3; - - int n = (l * 170 + s * 25 - 2320) / 100; - - if (n < 0) n = 0; - - if (n > 255) - printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); - - unsigned char N = (unsigned char)n; - //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfa$ - -// return min(max(N, minNfactor), maxNfactor); - - if(NmaxNfactor) return maxNfactor; - return N; -} - -int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[20], hash[8], target_swap[8]; - volatile unsigned char *hashc = (unsigned char *) hash; - volatile unsigned char *datac = (unsigned char *) data; - volatile unsigned char *pdatac = (unsigned char *) pdata; - uint32_t n = pdata[19] - 1; - int i; - - /* byte swap it */ - for(int z=0;z<20;z++) { - datac[(z*4) ] = pdatac[(z*4)+3]; - datac[(z*4)+1] = pdatac[(z*4)+2]; - datac[(z*4)+2] = pdatac[(z*4)+1]; - datac[(z*4)+3] = pdatac[(z*4) ]; - } - - int nfactor = GetNfactor(data[17]); - - do { - data[19] = ++n; - - scrypt((unsigned char *)data, 80, - (unsigned char *)data, 80, - nfactor, 0, 0, (unsigned char 
*)hash, 32); - - if (hashc[31] == 0 && hashc[30] == 0) { -/* - for(int z=7;z>=0;z--) - fprintf(stderr, "%08x ", hash[z]); - fprintf(stderr, "\n"); - - for(int z=7;z>=0;z--) - fprintf(stderr, "%08x ", ptarget[z]); - fprintf(stderr, "\n"); -*/ - if(fulltest(hash, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdatac[76] = datac[79]; - pdatac[77] = datac[78]; - pdatac[78] = datac[77]; - pdatac[79] = datac[76]; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} From 25632c95ace32522affb4904c5405f07ad357994 Mon Sep 17 00:00:00 2001 From: Thirtybird Date: Wed, 5 Jun 2013 16:29:25 -0400 Subject: [PATCH 22/25] fixed missing include in yacoin.c and updated scanhash functionname in miner.h --- miner.h | 2 +- yacoin.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/miner.h b/miner.h index bd04b3962..532cf9221 100644 --- a/miner.h +++ b/miner.h @@ -139,7 +139,7 @@ extern int scanhash_scrypt(int thr_id, uint32_t *pdata, unsigned char *scratchbuf, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, +extern int scanhash_yacoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); diff --git a/yacoin.c b/yacoin.c index 9a8685506..44b7db3cc 100644 --- a/yacoin.c +++ b/yacoin.c @@ -1,5 +1,6 @@ #include "cpuminer-config.h" #include "miner.h" +#include "scrypt-jane/scrypt-jane.h" #include #include From 9ad3ea9edd3706a9b35bad78f57f89d904615e28 Mon Sep 17 00:00:00 2001 From: ThirtyBird Date: Fri, 7 Jun 2013 17:20:18 -0300 Subject: [PATCH 23/25] Updated README with detailed build instructions for MinGW --- README | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/README b/README index fd958512c..a82dec5c3 100644 --- a/README +++ b/README @@ -22,17 +22,37 @@ Notes for AIX users: * 
GNU-style long options are not supported, but are accessible via configuration file -Basic Windows build instructions, using MinGW: +Detailed Windows build instructions, using MinGW (32-bit): Install MinGW and the MSYS Developer Tool Kit (http://www.mingw.org/) - * Make sure you have mstcpip.h in MinGW\include - If using MinGW-w64, install pthreads-w64 + * Choose C, C++ and MSys on install and select to have it update its libraries + * Install into C:\MinGW + Include mstcpip.h from WINE in your MinGW library + * http://source.winehq.org/source/include/mstcpip.h + * select version 1.3.34 + * copy this code into C:\MinGW\Include\mstcpip.h (strip out the line numbers!) Install libcurl devel (http://curl.haxx.se/download.html) - * Make sure you have libcurl.m4 in MinGW\share\aclocal - * Make sure you have curl-config in MinGW\bin - In the MSYS shell, run: - ./autogen.sh # only needed if building from git repo - LIBCURL="-lcurldll" ./configure CFLAGS="-O3" - make + * download curl-7.30.0.tar.gz from http://curl.haxx.se/download.html and put it in C:\deps\ + * launch an MSYS shell and enter the following commands (the configure step will take a long time!) + cd /c/deps + tar -xvzf curl-7.30.0.tar.gz + cd curl-7.30.0 + ./configure --prefix=/c/mingw + make + make install + * copy c:\deps\curl-7.30.0\docs\libcurl\libcurl.m4 c:\mingw\share\aclocal + * copy c:\deps\curl-7.30.0\curl-config c:\mingw\bin + In the MSYS shell, navigate to the CPUminer source code directory + * You will likely get higher hashrates by forcing the compiler to build the executable for your + specific CPU architecture. This is done by adding "-march=" into the CFLAGS. 
Those + values can be found at http://gcc.gnu.org/onlinedocs/gcc/i386-and-x86_002d64-Options.html + common choices for intel are : core2, corei7, corei7-avx + common choices for AMD are : athlon-fx + * Execute the following (replacing the value of -march with the value for your CPU type) + ./autogen.sh + ./configure CFLAGS="-march=core2 -O3" + make + strip minerd.exe + * copy minerd.exe, C:\MinGW\bin\libcurl-4.dll, and C:\MinGW\bin\pthreadGC2.dll to the same directory Architecture-specific notes: ARM: No runtime CPU detection. The miner can take advantage From fd65a3184664a12e3bc926ae905bea22060d7c9c Mon Sep 17 00:00:00 2001 From: ThirtyBird Date: Tue, 11 Jun 2013 14:41:59 -0400 Subject: [PATCH 24/25] updated latest scrypt-jane from floodyberry copy of commit https://github.com/floodyberry/scrypt-jane/commit/ba3983d0ce7e520989f7e914a5cd05ad4ffeefab to fix compilation under MinGW 64 with CPU_X86_FORCE_INTRINSICS --- scrypt-jane/code/scrypt-jane-mix_chacha-avx.h | 6 +++--- scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h | 6 +++--- scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h | 6 +++--- scrypt-jane/code/scrypt-jane-mix_salsa-avx.h | 6 +++--- scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h | 6 +++--- scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h | 4 ++-- scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h | 4 ++-- scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h | 4 ++-- scrypt-jane/code/scrypt-jane-portable-x86.h | 11 ++--------- scrypt-jane/code/scrypt-jane-portable.h | 3 +++ 10 files changed, 26 insertions(+), 30 deletions(-) diff --git a/scrypt-jane/code/scrypt-jane-mix_chacha-avx.h b/scrypt-jane/code/scrypt-jane-mix_chacha-avx.h index ab5ed2031..47804943d 100644 --- a/scrypt-jane/code/scrypt-jane-mix_chacha-avx.h +++ b/scrypt-jane/code/scrypt-jane-mix_chacha-avx.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || 
!defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_AVX @@ -142,7 +142,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_AVX @@ -261,7 +261,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) #define SCRYPT_CHACHA_AVX -static void NOINLINE +static void asm_calling_convention NOINLINE scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; diff --git a/scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h b/scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h index d040121e6..e92a84c7c 100644 --- a/scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h +++ b/scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSE2 @@ -136,7 +136,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSE2 @@ -261,7 +261,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) #define SCRYPT_CHACHA_SSE2 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, 
blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; diff --git a/scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h b/scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h index b0609f185..39e0a4945 100644 --- a/scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h +++ b/scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSSE3 @@ -146,7 +146,7 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) /* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSSE3 @@ -269,7 +269,7 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) #define SCRYPT_CHACHA_SSSE3 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa-avx.h b/scrypt-jane/code/scrypt-jane-mix_salsa-avx.h index 1ca90b5fa..5ee44097e 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa-avx.h +++ b/scrypt-jane/code/scrypt-jane-mix_salsa-avx.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_AVX @@ -128,7 +128,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* x64 */ -#if defined(X86_64ASM_AVX) && 
(!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_AVX @@ -245,7 +245,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) #define SCRYPT_SALSA_AVX -static void NOINLINE +static void asm_calling_convention NOINLINE scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h b/scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h index ecc5f0f8d..70e7b5669 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h +++ b/scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_SSE2 @@ -144,7 +144,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_SSE2 @@ -277,7 +277,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) #define SCRYPT_SALSA_SSE2 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h b/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h index 50c9902d5..fa23474eb 
100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h +++ b/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h @@ -1,5 +1,5 @@ /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA64_AVX @@ -176,7 +176,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX) +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) #define SCRYPT_SALSA64_AVX diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h b/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h index f8d957432..089527294 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h +++ b/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h @@ -1,5 +1,5 @@ /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA64_SSE2 @@ -220,7 +220,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2) +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) #define SCRYPT_SALSA64_SSE2 diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h b/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h index 105efa83f..9588a6b0b 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h +++ b/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h @@ -1,5 +1,5 @@ /* x64 */ -#if defined(X86_64ASM_SSSE3) 
&& (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA64_SSSE3 @@ -208,7 +208,7 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) /* intrinsic */ -#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3) +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) #define SCRYPT_SALSA64_SSSE3 diff --git a/scrypt-jane/code/scrypt-jane-portable-x86.h b/scrypt-jane/code/scrypt-jane-portable-x86.h index 192f76497..e23e80aaf 100644 --- a/scrypt-jane/code/scrypt-jane-portable-x86.h +++ b/scrypt-jane/code/scrypt-jane-portable-x86.h @@ -24,7 +24,7 @@ #endif #endif -#if defined(COMPILER_MSVC) +#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64)) #define X86_INTRINSIC #if defined(CPU_X86_64) || defined(X86ASM_SSE) #define X86_INTRINSIC_SSE @@ -37,14 +37,6 @@ #endif #endif -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC -#endif - #if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) #define X86_INTRINSIC #if defined(__SSE__) @@ -147,6 +139,7 @@ #define asm_align16 a1(ALIGN 16) #define asm_calling_convention STDCALL + #define aret(n) a1(ret n) #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn #define asm_naked_fn(fn) { #define asm_naked_fn_end(fn) } diff --git a/scrypt-jane/code/scrypt-jane-portable.h b/scrypt-jane/code/scrypt-jane-portable.h index 33c8c2cad..31558992e 100644 --- a/scrypt-jane/code/scrypt-jane-portable.h +++ b/scrypt-jane/code/scrypt-jane-portable.h @@ -279,3 +279,6 @@ scrypt_ensure_zero(void *p, size_t len) { #include "scrypt-jane-portable-x86.h" 
+#if !defined(asm_calling_convention) +#define asm_calling_convention +#endif From d8d44567955cbbbcc1c922f64b405b5a8126530b Mon Sep 17 00:00:00 2001 From: ThirtyBird Date: Tue, 11 Jun 2013 16:48:09 -0300 Subject: [PATCH 25/25] Updated README with detailed build instructions for MinGW 64-bit --- README | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/README b/README index a82dec5c3..82be5fba3 100644 --- a/README +++ b/README @@ -52,8 +52,49 @@ Detailed Windows build instructions, using MinGW (32-bit): ./configure CFLAGS="-march=core2 -O3" make strip minerd.exe + Combine the executables with the dependencies * copy minerd.exe, C:\MinGW\bin\libcurl-4.dll, and C:\MinGW\bin\pthreadGC2.dll to the same directory +Detailed Windows build instructions, using MinGW (64-bit): + Install MinGW and the MSYS Developer Tool Kit (http://www.mingw.org/) + * Choose C, C++ and MSys on install and select to have it update its libraries + * Install into C:\MinGW + * Add C:\MinGW\bin and c:\MinGW\msys\1.0 to your path + Download MinGW64 from http://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win64/Automated%20Builds/ + * Choose mingw-w64-bin_i686-mingw_20111220.zip + * Extract ZIP to C:\MinGW64 + * Add C:\MinGW64\bin to your path before C:\MinGW\bin + Install libcurl devel (http://curl.haxx.se/download.html) + * download curl-7.30.0.tar.gz from http://curl.haxx.se/download.html and put it in C:\deps\ + * launch an MSYS shell and enter the following commands (the configure step will take a long time!) 
+ cd /c/deps + tar -xvzf curl-7.30.0.tar.gz + cd curl-7.30.0 + ./configure --host=x86_64-w64-mingw32 --prefix=/c/mingw64 + make + make install + cp /c/deps/curl-7.30.0/docs/libcurl/libcurl.m4 /c/mingw/share/aclocal/libcurl.m4 + Install pthreads + * download pthreads-20100604.zip from http://sourceforge.net/projects/mingw-w64/files/External%20binary%20packages%20%28Win64%20hosted%29/pthreads/ and put it in C:\deps\ + * unzip the file to c:\deps\ + * In the mingw64 subdirectory is pthreads-w64.zip - extract the contents to C:\MinGW64 + In the MSYS shell, navigate to the CPUminer source code directory + * You will likely get higher hashrates by forcing the compiler to build the executable for your + specific CPU architecture. This is done by adding "-march=" into the CFLAGS. Those + values can be found at http://gcc.gnu.org/onlinedocs/gcc/i386-and-x86_002d64-Options.html + common choices for intel are : core2, corei7, corei7-avx + common choices for AMD are : athlon-fx + * Execute the following (replacing the value of -march with the value for your CPU type) + ./autogen.sh + ./configure --host=x86_64-w64-mingw32 CFLAGS="-O3 -march=core2 -DCPU_X86_FORCE_INTRINSICS" + make + Strip minerd.exe + * In a command prompt, in the compilation directory, execute the following + x86_64-w64-mingw32-strip minerd.exe + Combine the executables with the dependencies + * copy minerd.exe, C:\MinGW64\bin\libcurl-4.dll, and C:\MinGW64\bin\pthreadGC2-w64.dll to the same directory + + Architecture-specific notes: ARM: No runtime CPU detection. The miner can take advantage of some instructions specific to ARMv5E and later processors,