-
Notifications
You must be signed in to change notification settings - Fork 13.8k
ggml-hexagon: fix swiglu failure at test-backend-ops
#17344
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
ab75281
5aa4a83
a64154c
a8cdbcf
ae42fb6
39445ab
a589b61
6f57b9e
db9e930
ce48af5
fc5f31f
5707384
54235e3
3859465
014ad77
8c37457
33a050e
83884a5
f7662f3
5f553f0
f6d7f3c
37e9a1d
185dc20
6d88789
e07cbd6
55cea09
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,19 @@ | |
| #include "hvx-utils.h" | ||
| #include "ops-utils.h" | ||
|
|
||
| static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) { | ||
| static const float kInf = INFINITY; | ||
| static const float kMaxExp = 88.02f; // log(INF) | ||
|
|
||
| const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); | ||
| const HVX_Vector inf = hvx_vec_splat_fp32(kInf); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought we could move this initialization out of the for loop below. |
||
| const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); | ||
|
|
||
| HVX_Vector out = hvx_vec_exp_fp32(in_vec); | ||
|
|
||
| return Q6_V_vmux_QVV(pred0, inf, out); | ||
| } | ||
|
|
||
| void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) { | ||
| int left_over = num_elems & (VLEN_FP32 - 1); | ||
| int num_elems_whole = num_elems - left_over; | ||
|
|
@@ -42,9 +55,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int | |
| for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { | ||
| if (true == negate) { | ||
| HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); | ||
| *p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in); | ||
| *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in); | ||
| } else { | ||
| *p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++); | ||
| *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++); | ||
| } | ||
| } | ||
| } else { | ||
|
|
@@ -54,9 +67,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int | |
|
|
||
| if (true == negate) { | ||
| HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); | ||
| *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in); | ||
| *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in); | ||
| } else { | ||
| *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in); | ||
| *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -70,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int | |
| if (true == negate) { | ||
| HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); | ||
|
|
||
| vec_out = hvx_vec_exp_fp32(neg_vec_in); | ||
| vec_out = hvx_vec_exp_fp32_guard(neg_vec_in); | ||
| } else { | ||
| vec_out = hvx_vec_exp_fp32(in); | ||
| vec_out = hvx_vec_exp_fp32_guard(in); | ||
| } | ||
|
|
||
| hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,15 @@ | |
| #define VLEN_FP32 (VLEN / SIZEOF_FP32) | ||
| #define VLEN_FP16 (VLEN / SIZEOF_FP16) | ||
|
|
||
| typedef union { | ||
| HVX_Vector v; | ||
| uint8_t b[VLEN]; | ||
| uint16_t h[VLEN_FP16]; | ||
| uint32_t w[VLEN_FP32]; | ||
| __fp16 fp16[VLEN_FP16]; | ||
| float fp32[VLEN_FP32]; | ||
| } __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's safe to use the GCC extension here, since for HTP we're using clang. |
||
|
|
||
| static inline HVX_Vector hvx_vec_splat_fp32(float i) { | ||
| union { | ||
| float f; | ||
|
|
@@ -243,19 +252,16 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 | |
| } | ||
|
|
||
| static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { | ||
| union { | ||
| HVX_Vector v; | ||
| __fp16 d[64]; | ||
| } u = { .v = v }; | ||
| HVX_VectorAlias u = { .v = v }; | ||
|
|
||
| const uint32_t n0 = n / 16; | ||
| const uint32_t n1 = n % 16; | ||
| int i = 0; | ||
| for (; i < n0; i++) { | ||
| htp_dump_fp16_line(pref, u.d + (16 * i), 16); | ||
| htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16); | ||
| } | ||
| if (n1) { | ||
| htp_dump_fp16_line(pref, u.d + (16 * i), n1); | ||
| htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -411,8 +417,8 @@ static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n | |
|
|
||
| HVX_Vector sum = in, sum_t; | ||
| while (width < total) { | ||
| sum_t = Q6_V_vror_VR(sum, width); // rotate right | ||
| sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum | ||
| sum_t = Q6_V_vror_VR(sum, width); // rotate right | ||
| sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum | ||
| width = width << 1; | ||
| } | ||
| return sum; | ||
|
|
@@ -491,7 +497,7 @@ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) { | |
| static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) { | ||
| // neg by setting the fp16 sign bit | ||
| HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); | ||
| return Q6_V_vor_VV(v, mask); | ||
| return Q6_V_vxor_VV(v, mask); | ||
| } | ||
|
|
||
| static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) { | ||
|
|
@@ -506,7 +512,7 @@ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) { | |
| #else | ||
| // neg by setting the fp32 sign bit | ||
| HVX_Vector mask = Q6_V_vsplat_R(0x80000000); | ||
| return Q6_V_vor_VV(v, mask); | ||
| return Q6_V_vxor_VV(v, mask); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. using |
||
| #endif // __HTP_ARCH__ > 75 | ||
| } | ||
|
|
||
|
|
@@ -720,6 +726,24 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { | |
| return Q6_Vsf_equals_Vqf32(r_qf); | ||
| } | ||
|
|
||
| static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) { | ||
| static const float kInf = INFINITY; | ||
| static const uint32_t kNanMask = 0x7fffffff; | ||
| static const uint32_t kNanMin = 0x7f800000; | ||
|
|
||
| const HVX_Vector inf = hvx_vec_splat_fp32(kInf); | ||
| const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); | ||
|
|
||
| HVX_Vector out = hvx_vec_inverse_fp32(v_sf); | ||
|
|
||
| const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask); | ||
| const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin); | ||
| HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask); | ||
| const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out); | ||
|
|
||
| return Q6_V_vmux_QVV(pred, out, Q6_V_vzero()); | ||
| } | ||
|
|
||
| #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 | ||
| #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 | ||
| #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 | ||
|
|
@@ -934,6 +958,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { | |
| return Q6_Vsf_equals_Vqf32(temp); | ||
| } | ||
|
|
||
| static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { | ||
| static const float kMaxExp = -88.02f; // log(INF) | ||
|
|
||
| const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); | ||
| const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); | ||
|
|
||
| HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); | ||
| return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); | ||
| } | ||
|
|
||
| static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { | ||
| int step_of_1 = num_elems >> 5; | ||
| int remaining = num_elems - step_of_1 * VLEN_FP32; | ||
|
|
@@ -945,7 +979,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * | |
|
|
||
| #pragma unroll(4) | ||
| for (int i = 0; i < step_of_1; i++) { | ||
| v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]); | ||
| v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]); | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like we should use
`src0->ne[0]` here instead of `dst->ne[0]`.