Use NEON intrinsics in Vec4 dot operation (#598)

This commit is contained in:
PabloMK7 2025-03-01 23:24:19 +01:00 committed by GitHub
parent c7d31dda9f
commit 8149302187
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,3 +1,4 @@
// Copyright Citra Emulator Project / Azahar Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
@@ -30,6 +31,10 @@
#pragma once
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include <cmath>
#include <cstring>
#include <type_traits>
@@ -682,6 +687,23 @@ template <typename T>
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
/// Dot product of two Vec4<float> values (explicit specialization).
///
/// When the compiler defines __ARM_NEON the four lane products are computed
/// with a single vector multiply and then reduced horizontally; otherwise the
/// plain scalar expression is used. The NEON reduction adds the products in a
/// different order than the scalar expression, so the two paths may differ by
/// floating-point rounding.
///
/// NOTE(review): assumes AsArray() yields a pointer to four contiguous floats
/// (x, y, z, w) -- confirm against the Vec4 definition.
///
/// @param a Left-hand operand.
/// @param b Right-hand operand.
/// @return Sum of the component-wise products of a and b.
template <>
[[nodiscard]] inline float Dot(const Vec4<float>& a, const Vec4<float>& b) {
#if !defined(__ARM_NEON)
    // No NEON available: scalar multiply-and-accumulate.
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
#else
    // Load both operands and multiply all four lanes at once.
    const float32x4_t products = vmulq_f32(vld1q_f32(a.AsArray()), vld1q_f32(b.AsArray()));
#if !defined(__aarch64__)
    // 32-bit ARM lacks the across-vector add: fold the high half onto the low
    // half, then pairwise-add the remaining two lanes and read lane 0.
    const float32x2_t folded = vadd_f32(vget_high_f32(products), vget_low_f32(products));
    return vget_lane_f32(vpadd_f32(folded, folded), 0);
#else
    // AArch64 provides a single across-vector add.
    return vaddvq_f32(products);
#endif
#endif
}
template <typename T>
[[nodiscard]] constexpr Vec3<decltype(T{} * T{} - T{} * T{})> Cross(const Vec3<T>& a,
const Vec3<T>& b) {