diff --git a/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h b/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h index 024fb7874..a648a7713 100644 --- a/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h +++ b/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h @@ -31,7 +31,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ float32x4_t abssum = vabsq_f32(vec_bs); temp_max = vmaxq_f32(abssum, temp_max); }} - float32_t scales = 127 / vmaxvq_f32(temp_max); + float32_t scales = 127 / (vmaxvq_f32(temp_max) + 1e-10f); *lut_scales = scales; #elif defined __AVX2__ __m256 max_vec = _mm256_set1_ps(0.f); @@ -45,7 +45,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif }} diff --git a/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h b/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h index 88dc9e2a0..7a46a0374 100644 --- a/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h +++ b/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h @@ -82,7 +82,7 @@ inline int32_t per_tensor_quant(int k, void* lut_scales_, void* b_) { __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif return 0; diff --git a/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h b/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h index 3f3f551b8..d71467933 100644 --- a/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h +++ b/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h @@ -31,7 +31,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ float32x4_t abssum = vabsq_f32(vec_bs); temp_max = vmaxq_f32(abssum, temp_max); }} - float32_t scales = 127 / vmaxvq_f32(temp_max); + float32_t scales = 127 / (vmaxvq_f32(temp_max) + 1e-10f); *lut_scales = scales; #elif defined __AVX2__ __m256 max_vec = _mm256_set1_ps(0.f); @@ -45,7 +45,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif }} diff --git a/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h b/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h index 678b0f32b..4e0c689dd 100644 --- a/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h +++ b/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h @@ -82,7 +82,7 @@ inline int32_t per_tensor_quant(int k, void* lut_scales_, void* b_) { __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif return 0; diff --git a/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h b/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h index d38806b5e..636bb702d 100644 --- a/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h +++ b/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h @@ -31,7 +31,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ float32x4_t abssum = vabsq_f32(vec_bs); temp_max = vmaxq_f32(abssum, temp_max); }} - float32_t scales = 127 / vmaxvq_f32(temp_max); + float32_t scales = 127 / (vmaxvq_f32(temp_max) + 1e-10f); *lut_scales = scales; #elif defined __AVX2__ __m256 max_vec = _mm256_set1_ps(0.f); @@ -45,7 +45,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif }} diff --git a/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h b/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h index 92bda56b4..5e3a4f79a 100644 --- a/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h +++ b/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h @@ -82,7 +82,7 @@ inline int32_t per_tensor_quant(int k, void* lut_scales_, void* b_) { __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif return 0;