Skip to content

Commit b050834

Browse files
committed
Add intel simd [skip ci]
1 parent f36afb8 commit b050834

12 files changed

+671
-202
lines changed

benchmark_analysis.ipynb

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "b49ae6d6",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import pandas as pd\n",
11+
"import numpy as np\n",
12+
"import matplotlib.pyplot as plt\n",
13+
"import seaborn as sns\n",
14+
"\n",
15+
"plt.rcParams['figure.figsize'] = (16, 10)\n",
16+
"plt.rcParams['font.size'] = 11"
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": null,
22+
"id": "d236980d",
23+
"metadata": {},
24+
"outputs": [],
25+
"source": [
26+
"def parse_csv(filepath):\n",
27+
" with open(filepath, 'r') as f:\n",
28+
" lines = f.readlines()[1:]\n",
29+
" \n",
30+
" data = []\n",
31+
" for line in lines:\n",
32+
" line = line.strip()\n",
33+
" if line and ',' in line and not line.endswith(','):\n",
34+
" parts = line.split(',')\n",
35+
" if len(parts) >= 3:\n",
36+
" try:\n",
37+
" data.append({'Benchmark': parts[0].strip(), 'Time': float(parts[2])})\n",
38+
" except:\n",
39+
" continue\n",
40+
" return pd.DataFrame(data)\n",
41+
"\n",
42+
"baseline = parse_csv('BASELINE_bench.csv')\n",
43+
"custom = parse_csv('CUSTOM_AVX2_bench.csv')\n",
44+
"\n",
45+
"merged = baseline.merge(custom, on='Benchmark', suffixes=('_baseline', '_custom'))\n",
46+
"merged['improvement'] = ((merged['Time_baseline'] - merged['Time_custom']) / merged['Time_baseline']) * 100"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": null,
52+
"id": "8442b12d",
53+
"metadata": {},
54+
"outputs": [],
55+
"source": [
56+
"sorted_data = merged.sort_values('improvement', ascending=False)\n",
57+
"top10 = sorted_data.head(10)\n",
58+
"bottom10 = sorted_data.tail(10)\n",
59+
"filtered = pd.concat([top10, bottom10])"
60+
]
61+
},
62+
{
63+
"cell_type": "code",
64+
"execution_count": null,
65+
"id": "aa07550a",
66+
"metadata": {},
67+
"outputs": [],
68+
"source": [
69+
"heatmap_data = filtered.set_index('Benchmark')[['improvement']]\n",
70+
"\n",
71+
"plt.figure(figsize=(8, 12))\n",
72+
"sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='RdYlGn', center=0, \n",
73+
" cbar_kws={'label': 'Performance Improvement (%)'})\n",
74+
"plt.title('CUSTOM_AVX2 vs BASELINE Performance (Top/Bottom 10)', fontsize=14, fontweight='bold')\n",
75+
"plt.ylabel('')\n",
76+
"plt.tight_layout()\n",
77+
"plt.show()"
78+
]
79+
}
80+
],
81+
"metadata": {
82+
"kernelspec": {
83+
"display_name": "Python 3",
84+
"language": "python",
85+
"name": "python3"
86+
},
87+
"language_info": {
88+
"codemirror_mode": {
89+
"name": "ipython",
90+
"version": 3
91+
},
92+
"file_extension": ".py",
93+
"mimetype": "text/x-python",
94+
"name": "python",
95+
"nbconvert_exporter": "python",
96+
"pygments_lexer": "ipython3",
97+
"version": "3.12.3"
98+
}
99+
},
100+
"nbformat": 4,
101+
"nbformat_minor": 5
102+
}

simd-bench.sh

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/bin/bash
# Benchmark driver preamble: shared settings plus the machine tuning that is
# applied before any benchmark runs.
set -e

# Iteration count handed to the benchmark binaries; override via the environment.
BENCH_ITERS=${SECP256K1_BENCH_ITERS:-20000}

# ANSI colour escapes, interpreted by `echo -e`.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Write 1 to the intel_pstate no_turbo control and select the "performance"
# governor on CPU 0. Both commands go through sudo, and with `set -e` a failure
# of either one aborts the script.
# NOTE: neither setting is restored when the script exits.
echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo > /dev/null
sudo cpupower -c 0 frequency-set -g performance > /dev/null

# Pin benchmarks to CPU 0 when taskset is installed. Initialised explicitly so a
# stray TASKSET_CMD inherited from the caller's environment is never executed.
TASKSET_CMD=""
if command -v taskset > /dev/null; then
    TASKSET_CMD="taskset -c 0"
fi
15+
16+
# Run one benchmark binary and append its output to a log file.
#   $1  build directory to run from; the log is addressed as ../../$3 from
#       inside it, so it must sit two levels below the directory holding the log
#   $2  name of the binary inside "$1/bin"
#   $3  log file path relative to the caller's working directory
# Returns the benchmark's exit status, or 1 if the build directory cannot be
# entered.
run_bench() {
    local dir=$1 bin=$2 log=$3
    (
        # This function is used as an `if` condition, where `set -e` is ignored,
        # so every failure has to be propagated by hand. Without this the
        # subshell would return the status of the final `echo`.
        cd "$dir" || exit 1

        status=0
        # TASKSET_CMD is deliberately unquoted: it is either empty or a
        # multi-word command prefix that must be word-split.
        ${TASKSET_CMD:-} env SECP256K1_BENCH_ITERS="$BENCH_ITERS" nice -n 0 "./bin/$bin" >> "../../$log" 2>&1 || status=$?

        # Blank separator line between the outputs of consecutive binaries.
        echo "" >> "../../$log"
        exit "$status"
    )
}
24+
25+
bench_all() {
26+
local config="$1"
27+
local dir="build/$config"
28+
local log="${config}_bench.csv"
29+
30+
if [[ ! -d "$dir" ]]; then
31+
echo -e "${RED}$config${NC} (no dir)"
32+
return 1
33+
fi
34+
35+
{
36+
echo "Benchmark results for $config"
37+
echo "Generated on $(date)"
38+
echo "Iterations: $BENCH_ITERS"
39+
echo ""
40+
} > "$log"
41+
42+
for bin in bench bench_ecmult bench_internal; do
43+
if run_bench "$dir" "$bin" "$log"; then
44+
echo -e " ${GREEN}$bin${NC}"
45+
else
46+
echo -e " ${RED}$bin${NC}"
47+
return 1
48+
fi
49+
done
50+
51+
echo -e "${GREEN}$config${NC} (log: $log)"
52+
}
53+
54+
# Benchmark both configurations in order; with `set -e` active the script stops
# at the first configuration that fails.
for config in BASELINE CUSTOM_AVX2; do
    bench_all "$config"
done

echo -e "\n${YELLOW}All benchmarks successful. Logs in project root${NC}"

simd-build.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/bash
# Configure and build the BASELINE and CUSTOM_AVX2 variants with CMake + Ninja,
# one build tree per variant under build/, logging to <variant>_build.log.
set -e

# ANSI colour escapes, interpreted by `echo -e`.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

mkdir -p build

# Build one variant.
#   $1  variant name; the build tree is build/$1 and the log is ${1}_build.log
#   $2  extra compiler flags appended after "-O3 -mavx2"
# Returns 1 if the build tree cannot be entered or cmake/ninja fails.
run_build() {
    local name="$1"
    local cflags="-O3 -mavx2 $2"
    local tree="build/$name"
    local logfile="${name}_build.log"

    mkdir -p "$tree"

    # Run in a subshell so the caller's working directory is untouched. cmake
    # truncates the log, ninja appends to it.
    if ! (
        cd "$tree" &&
        cmake ../.. -G Ninja -DCMAKE_BUILD_TYPE=Release -DSECP256K1_APPEND_CFLAGS="$cflags" > "../../$logfile" 2>&1 &&
        ninja >> "../../$logfile" 2>&1
    ); then
        echo -e "${RED}$name failed${NC}"
        return 1
    fi

    echo -e "${GREEN}$name${NC}"
}

# The two variants differ only in whether __AVX2__ is undefined or defined.
run_build "BASELINE" "-U__AVX2__"
run_build "CUSTOM_AVX2" "-D__AVX2__"

echo -e "\n${YELLOW}All builds done. Logs in project root${NC}"

simd-test.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/bash
# Run ctest in the BASELINE and CUSTOM_AVX2 build trees, logging each run to
# <variant>_test.log in the current directory.
set -e

# ANSI colour escapes, interpreted by `echo -e`.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Run the test suite of one variant.
#   $1  variant name; tests run in build/$1 and log to ${1}_test.log
# Prints a green line on success and a red one on failure; returns 1 when the
# build tree is missing or ctest fails.
run_test() {
    local name="$1"
    local tree="build/$name"
    local logfile="${name}_test.log"
    local colour="$GREEN"
    local rc=0

    [[ -d "$tree" ]] || {
        echo -e "${RED}$name${NC} (no dir)"
        return 1
    }

    # Subshell keeps the caller's working directory; stdout and stderr of ctest
    # both go to the log.
    (cd "$tree" && ctest --output-on-failure -j"$(nproc)" > "../../$logfile" 2>&1) || rc=1

    if (( rc != 0 )); then
        colour="$RED"
    fi
    echo -e "${colour}$name${NC} (log: $logfile)"
    return "$rc"
}

# With `set -e` active the script stops at the first failing variant.
for config in BASELINE CUSTOM_AVX2; do
    run_test "$config"
done

echo -e "\n${YELLOW}All tests passed. Logs in project root${NC}"

src/field_10x26_impl.h

Lines changed: 55 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
#include "field.h"
1313
#include "modinv32_impl.h"
1414

15+
#ifdef X86
16+
# include <immintrin.h>
17+
#endif
18+
1519
#ifdef VERIFY
1620
static void secp256k1_fe_impl_verify(const secp256k1_fe *a) {
1721
const uint32_t *d = a->n;
@@ -38,16 +42,20 @@ static void secp256k1_fe_impl_verify(const secp256k1_fe *a) {
3842
#endif
3943

4044
/* Fill r with the per-limb upper bounds for magnitude m: limbs 0..8 receive
 * 0x3FFFFFF * 2 * m and limb 9 receives 0x03FFFFF * 2 * m. */
static void secp256k1_fe_impl_get_bounds(secp256k1_fe *r, int m) {
    const uint64_t scale = 2 * m;
    const uint64_t limb_bound = 0x3FFFFFFUL * scale;
    const uint64_t top_bound = 0x03FFFFFUL * scale;
    int i;

    /* The nine low limbs share one bound; only the top limb is narrower. */
    for (i = 0; i < 9; i++) {
        r->n[i] = limb_bound;
    }
    r->n[9] = top_bound;
}
5260

5361
static void secp256k1_fe_impl_normalize(secp256k1_fe *r) {
@@ -257,8 +265,8 @@ static int secp256k1_fe_impl_normalizes_to_zero_var(const secp256k1_fe *r) {
257265
}
258266

259267
/* Set r to the small integer a: limb 0 takes the value, every other limb of
 * r->n is cleared. */
SECP256K1_INLINE static void secp256k1_fe_impl_set_int(secp256k1_fe *r, int a) {
    size_t i;

    r->n[0] = a;
    for (i = 1; i < sizeof(r->n) / sizeof(r->n[0]); i++) {
        r->n[i] = 0;
    }
}
263271

264272
SECP256K1_INLINE static int secp256k1_fe_impl_is_zero(const secp256k1_fe *a) {
@@ -272,12 +280,11 @@ SECP256K1_INLINE static int secp256k1_fe_impl_is_odd(const secp256k1_fe *a) {
272280

273281
static int secp256k1_fe_impl_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) {
274282
int i;
283+
int diff;
275284
for (i = 9; i >= 0; i--) {
276-
if (a->n[i] > b->n[i]) {
277-
return 1;
278-
}
279-
if (a->n[i] < b->n[i]) {
280-
return -1;
285+
diff = (a->n[i] > b->n[i]) - (a->n[i] < b->n[i]);
286+
if (diff != 0) {
287+
return diff;
281288
}
282289
}
283290
return 0;
@@ -338,24 +345,32 @@ static void secp256k1_fe_impl_get_b32(unsigned char *r, const secp256k1_fe *a) {
338345
}
339346

340347
/* Compute r = k - a limb by limb, where k is a set of per-limb constants
 * scaled by 2 * (m + 1). r and a may point to the same element: each limb is
 * read before the limb with the same index is written. */
SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r, const secp256k1_fe *a, int m) {
    const uint32_t scale = 2 * (m + 1);
    const uint32_t k_limb0 = 0x3FFFC2FUL * scale;
    const uint32_t k_limb1 = 0x3FFFFBFUL * scale;
    const uint32_t k_mid = 0x3FFFFFFUL * scale;
    const uint32_t k_top = 0x03FFFFFUL * scale;
    int i;

    /* For all legal values of m (0..31), the following properties hold: */
    VERIFY_CHECK(k_limb0 >= 0x3FFFFFFUL * 2 * m);
    VERIFY_CHECK(k_limb1 >= 0x3FFFFFFUL * 2 * m);
    VERIFY_CHECK(k_mid >= 0x3FFFFFFUL * 2 * m);
    VERIFY_CHECK(k_top >= 0x03FFFFFUL * 2 * m);

    /* Due to the properties above, the left hand in the subtractions below is never less than
     * the right hand. */
    r->n[0] = k_limb0 - a->n[0];
    r->n[1] = k_limb1 - a->n[1];
    /* Limbs 2..8 all use the same constant. */
    for (i = 2; i <= 8; i++) {
        r->n[i] = k_mid - a->n[i];
    }
    r->n[9] = k_top - a->n[9];
}
360375

361376
SECP256K1_INLINE static void secp256k1_fe_impl_mul_int_unchecked(secp256k1_fe *r, int a) {
@@ -1111,26 +1126,26 @@ static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r,
11111126
}
11121127

11131128
/* Pack the ten limbs of a into the eight words of r. Limb i is placed at bit
 * offset 26 * i of the packed value, so most limbs straddle two words.
 * NOTE(review): the ORs presumably rely on each limb fitting its slot (26 bits,
 * 22 for limb 9) - confirm callers pass a normalized element. */
static void secp256k1_fe_impl_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) {
    const uint32_t *limb = a->n;

    r->n[0] = limb[0] | (limb[1] << 26);
    r->n[1] = (limb[1] >> 6) | (limb[2] << 20);
    r->n[2] = (limb[2] >> 12) | (limb[3] << 14);
    r->n[3] = (limb[3] >> 18) | (limb[4] << 8);
    /* Word 4 is the only one that receives bits from three limbs. */
    r->n[4] = (limb[4] >> 24) | (limb[5] << 2) | (limb[6] << 28);
    r->n[5] = (limb[6] >> 4) | (limb[7] << 22);
    r->n[6] = (limb[7] >> 10) | (limb[8] << 16);
    r->n[7] = (limb[8] >> 16) | (limb[9] << 10);
}
11231138

11241139
/* Unpack the eight words of a into the ten limbs of r. Limbs 0..8 are masked
 * to 26 bits; limb 9 takes whatever remains above bit 10 of the last word. */
static SECP256K1_INLINE void secp256k1_fe_impl_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) {
    const unsigned long limb_mask = 0x3FFFFFFUL;
    uint32_t *limb = r->n;

    limb[0] = a->n[0] & limb_mask;
    /* Each straddling limb joins the high bits of one word with the masked low
     * bits of the next. */
    limb[1] = (a->n[0] >> 26) | ((a->n[1] << 6) & limb_mask);
    limb[2] = (a->n[1] >> 20) | ((a->n[2] << 12) & limb_mask);
    limb[3] = (a->n[2] >> 14) | ((a->n[3] << 18) & limb_mask);
    limb[4] = (a->n[3] >> 8) | ((a->n[4] << 24) & limb_mask);
    /* Limb 5 lies entirely inside word 4. */
    limb[5] = (a->n[4] >> 2) & limb_mask;
    limb[6] = (a->n[4] >> 28) | ((a->n[5] << 4) & limb_mask);
    limb[7] = (a->n[5] >> 22) | ((a->n[6] << 10) & limb_mask);
    limb[8] = (a->n[6] >> 16) | ((a->n[7] << 16) & limb_mask);
    limb[9] = a->n[7] >> 10;
}
11361151

0 commit comments

Comments
 (0)