From 814930218799068cd85829f72bf2ab134954e0f5 Mon Sep 17 00:00:00 2001
From: PabloMK7
Date: Sat, 1 Mar 2025 23:24:19 +0100
Subject: [PATCH] Use NEON intrinsics in Vec4 dot operation (#598)

---
 src/common/vector_math.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/common/vector_math.h b/src/common/vector_math.h
index 51f33859e..ba6a2bd79 100644
--- a/src/common/vector_math.h
+++ b/src/common/vector_math.h
@@ -1,3 +1,4 @@
+// Copyright Citra Emulator Project / Azahar Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
@@ -30,6 +31,10 @@
 
 #pragma once
 
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
 #include
 #include
 #include
@@ -682,6 +687,23 @@ template
     return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
 }
 
+template <>
+[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
+#ifdef __ARM_NEON
+    float32x4_t va = vld1q_f32(a.AsArray());
+    float32x4_t vb = vld1q_f32(b.AsArray());
+    float32x4_t result = vmulq_f32(va, vb);
+#if defined(__aarch64__) // Use vaddvq_f32 in ARMv8 architectures
+    return vaddvq_f32(result);
+#else // Use manual addition for older architectures
+    float32x2_t sum2 = vadd_f32(vget_high_f32(result), vget_low_f32(result));
+    return vget_lane_f32(vpadd_f32(sum2, sum2), 0);
+#endif
+#else
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+#endif
+}
+
 template
 [[nodiscard]] constexpr Vec3 Cross(const Vec3& a, const Vec3& b) {