From f00a8f46d8fe9d19729cd36438b0fa5a4d308234 Mon Sep 17 00:00:00 2001 From: Alibek Omarov Date: Sat, 22 Jul 2023 04:21:00 +0300 Subject: [PATCH 1/4] client: studio: fix types in AArch64 NEON code --- cl_dll/studio_util.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cl_dll/studio_util.cpp b/cl_dll/studio_util.cpp index 2d4bad80..c2fb3bed 100644 --- a/cl_dll/studio_util.cpp +++ b/cl_dll/studio_util.cpp @@ -24,9 +24,9 @@ AngleMatrix ==================== */ #if XASH_SIMD_NEON -const uint32x4_t AngleMatrix_sign0 = vreinterpretq_f32_u32(vsetq_lane_u32(0x80000000, vdupq_n_u32(0), 0)); -const uint32x4_t AngleMatrix_sign1 = vreinterpretq_f32_u32(vsetq_lane_u32(0x80000000, vdupq_n_u32(0), 1)); -const uint32x4_t AngleMatrix_sign2 = vreinterpretq_f32_u32(vsetq_lane_u32(0x80000000, vdupq_n_u32(0), 2)); +static const uint32x4_t AngleMatrix_sign0 = vsetq_lane_u32(0x80000000, vdupq_n_u32(0), 0); +static const uint32x4_t AngleMatrix_sign1 = vsetq_lane_u32(0x80000000, vdupq_n_u32(0), 1); +static const uint32x4_t AngleMatrix_sign2 = vsetq_lane_u32(0x80000000, vdupq_n_u32(0), 2); #endif void AngleMatrix( const float *angles, float (*matrix)[4] ) { @@ -254,7 +254,11 @@ AngleQuaternion ==================== */ #if XASH_SIMD_NEON -const float32x4_t AngleQuaternion_sign2 = vzipq_f32(vdupq_n_u32(0x80000000), vdupq_n_u32(0x00000000)).val[0]; // { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; +static const float32x4_t AngleQuaternion_sign2 = + vzipq_f32( + vreinterpret_f32_u32(vdupq_n_u32(0x80000000)), + vreinterpret_f32_u32(vdupq_n_u32(0x00000000)) + ).val[0]; // { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; #endif void AngleQuaternion( float *angles, vec4_t quaternion ) { @@ -272,7 +276,7 @@ void AngleQuaternion( float *angles, vec4_t quaternion ) float32x4_t sy_cr_cy_sr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 1); float32x4_t cr_cy_sr_sy = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 2); float32x4_t cy_sr_sy_cr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 3); - float32x4_t sp_sp_sp_sp_signed = veorq_u32(sp_sp_sp_sp, AngleQuaternion_sign2); + float32x4_t sp_sp_sp_sp_signed = veorq_u32(vreinterpret_u32_f32(sp_sp_sp_sp), AngleQuaternion_sign2); float32x4_t left = vmulq_f32(vmulq_f32(sr_sy_cr_cy, cp_cp_cp_cp), cy_sr_sy_cr); From b4bd06603e871c54a40e798c6c8a9a6d344e98c8 Mon Sep 17 00:00:00 2001 From: Alibek Omarov Date: Sat, 22 Jul 2023 04:28:51 +0300 Subject: [PATCH 2/4] client: studio: fix NEON vreinterpret types, fix memcpy --- cl_dll/studio_util.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cl_dll/studio_util.cpp b/cl_dll/studio_util.cpp index c2fb3bed..00419a39 100644 --- a/cl_dll/studio_util.cpp +++ b/cl_dll/studio_util.cpp @@ -216,7 +216,7 @@ void ConcatTransforms( float in1[3][4], float in2[3][4], float out[3][4] ) out_reg.val[2] = vfmaq_laneq_f32(out_reg.val[2], in2_reg.val[1], in1_reg.val[2], 1); out_reg.val[2] = vfmaq_laneq_f32(out_reg.val[2], in2_reg.val[2], in1_reg.val[2], 2); - memcpy(&out, &out_reg, sizeof(out)); + memcpy(out, &out_reg, sizeof(float) * 3 * 4); #else out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] + in1[0][2] * in2[2][0]; @@ -256,8 +256,8 @@ AngleQuaternion #if XASH_SIMD_NEON static const float32x4_t AngleQuaternion_sign2 = vzipq_f32( - vreinterpret_f32_u32(vdupq_n_u32(0x80000000)), - vreinterpret_f32_u32(vdupq_n_u32(0x00000000)) + vreinterpretq_f32_u32(vdupq_n_u32(0x80000000)), + vreinterpretq_f32_u32(vdupq_n_u32(0x00000000)) ).val[0]; // { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; #endif void AngleQuaternion( float *angles, vec4_t quaternion ) @@ -276,7 +276,7 @@ void AngleQuaternion( float *angles, vec4_t quaternion ) float32x4_t sy_cr_cy_sr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 1); float32x4_t cr_cy_sr_sy = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 2); float32x4_t cy_sr_sy_cr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 3); - float32x4_t sp_sp_sp_sp_signed = veorq_u32(vreinterpret_u32_f32(sp_sp_sp_sp), AngleQuaternion_sign2); + float32x4_t sp_sp_sp_sp_signed = veorq_u32(vreinterpretq_u32_f32(sp_sp_sp_sp), AngleQuaternion_sign2); float32x4_t left = vmulq_f32(vmulq_f32(sr_sy_cr_cy, cp_cp_cp_cp), cy_sr_sy_cr); From d4995df92ceb921fbfbd46f3f6f41fbd2bcac8bc Mon Sep 17 00:00:00 2001 From: Alibek Omarov Date: Sat, 22 Jul 2023 04:36:03 +0300 Subject: [PATCH 3/4] client: studio: another NEON type fix for AngleQuaternion, static-ize global NEON vectors --- cl_dll/studio_util.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/cl_dll/studio_util.cpp b/cl_dll/studio_util.cpp index 00419a39..fc241b88 100644 --- a/cl_dll/studio_util.cpp +++ b/cl_dll/studio_util.cpp @@ -254,11 +254,7 @@ AngleQuaternion ==================== */ #if XASH_SIMD_NEON -static const float32x4_t AngleQuaternion_sign2 = - vzipq_f32( - vreinterpretq_f32_u32(vdupq_n_u32(0x80000000)), - vreinterpretq_f32_u32(vdupq_n_u32(0x00000000)) - ).val[0]; // { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; +static const uint32x4_t AngleQuaternion_sign2 = vzipq_u32(vdupq_n_u32(0x80000000), vdupq_n_u32(0x00000000)).val[0]; // { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; #endif void AngleQuaternion( float *angles, vec4_t quaternion ) { @@ -411,12 +407,12 @@ QuaternionMatrix ==================== */ #if XASH_SIMD_NEON -const uint32x4_t QuaternionMatrix_sign1 = vsetq_lane_u32(0x80000000, vdupq_n_u32(0x00000000), 0); // { 0x80000000, 0x00000000, 0x00000000, 0x00000000 }; -const uint32x4_t QuaternionMatrix_sign2 = vsetq_lane_u32(0x80000000, vdupq_n_u32(0x00000000), 1); // { 0x00000000, 0x80000000, 0x00000000, 0x00000000 }; -const uint32x4_t QuaternionMatrix_sign3 = vsetq_lane_u32(0x00000000, vdupq_n_u32(0x80000000), 2); // { 0x80000000, 0x80000000, 0x00000000, 0x80000000 }; -const float32x4_t matrix3x4_identity_0 = vsetq_lane_f32(1, vdupq_n_f32(0), 0); // { 1, 0, 0, 0 } -const float32x4_t matrix3x4_identity_1 = vsetq_lane_f32(1, vdupq_n_f32(0), 1); // { 0, 1, 0, 0 } -const float32x4_t matrix3x4_identity_2 = vsetq_lane_f32(1, vdupq_n_f32(0), 2); // { 0, 0, 1, 0 } +static const uint32x4_t QuaternionMatrix_sign1 = vsetq_lane_u32(0x80000000, vdupq_n_u32(0x00000000), 0); // { 0x80000000, 0x00000000, 0x00000000, 0x00000000 }; +static const uint32x4_t QuaternionMatrix_sign2 = vsetq_lane_u32(0x80000000, vdupq_n_u32(0x00000000), 1); // { 0x00000000, 0x80000000, 0x00000000, 0x00000000 }; +static const uint32x4_t QuaternionMatrix_sign3 = vsetq_lane_u32(0x00000000, vdupq_n_u32(0x80000000), 2); // { 0x80000000, 0x80000000, 0x00000000, 0x80000000 }; +static const float32x4_t matrix3x4_identity_0 = vsetq_lane_f32(1, vdupq_n_f32(0), 0); // { 1, 0, 0, 0 } +static const float32x4_t matrix3x4_identity_1 = vsetq_lane_f32(1, vdupq_n_f32(0), 1); // { 0, 1, 0, 0 } +static const float32x4_t matrix3x4_identity_2 = vsetq_lane_f32(1, vdupq_n_f32(0), 2); // { 0, 0, 1, 0 } #endif void QuaternionMatrix( vec4_t quaternion, float (*matrix)[4] ) From 43710ea984094aef985c49a4ec30146cec3aeb5e Mon Sep 17 00:00:00 2001 From: Alibek Omarov Date: Sat, 22 Jul 2023 04:45:59 +0300 Subject: [PATCH 4/4] client: studio: NEON vector types all over place here --- cl_dll/studio_util.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl_dll/studio_util.cpp b/cl_dll/studio_util.cpp index fc241b88..d1dbeb7f 100644 --- a/cl_dll/studio_util.cpp +++ b/cl_dll/studio_util.cpp @@ -272,7 +272,7 @@ void AngleQuaternion( float *angles, vec4_t quaternion ) float32x4_t sy_cr_cy_sr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 1); float32x4_t cr_cy_sr_sy = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 2); float32x4_t cy_sr_sy_cr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 3); - float32x4_t sp_sp_sp_sp_signed = veorq_u32(vreinterpretq_u32_f32(sp_sp_sp_sp), AngleQuaternion_sign2); + float32x4_t sp_sp_sp_sp_signed = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(sp_sp_sp_sp), AngleQuaternion_sign2)); float32x4_t left = vmulq_f32(vmulq_f32(sr_sy_cr_cy, cp_cp_cp_cp), cy_sr_sy_cr);