// This file is modified from the Ares N64 emulator core. Ares can
// be found at https://github.com/ares-emulator/ares. The original license
// for this portion of Ares is as follows:
// ----------------------------------------------------------------------
// ares
//
// Copyright(c) 2004 - 2021 ares team, Near et al
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
// ---------------------------------------------------------------------- #include #include using u32 = uint32_t; #define ACCH vpu.acch #define ACCM vpu.accm #define ACCL vpu.accl #define VCOH vpu.vcoh #define VCOL vpu.vcol #define VCCH vpu.vcch #define VCCL vpu.vccl #define VCE vpu.vce #define DIVIN vpu.divin #define DIVOUT vpu.divout #define DIVDP vpu.divdp auto RSP::r128::operator()(u32 index) const -> r128 { if constexpr (Accuracy::RSP::SISD) { r128 v{ *this }; switch (index) { case 0: break; case 1: break; case 2: v.u16(1) = v.u16(0); v.u16(3) = v.u16(2); v.u16(5) = v.u16(4); v.u16(7) = v.u16(6); break; case 3: v.u16(0) = v.u16(1); v.u16(2) = v.u16(3); v.u16(4) = v.u16(5); v.u16(6) = v.u16(7); break; case 4: v.u16(1) = v.u16(2) = v.u16(3) = v.u16(0); v.u16(5) = v.u16(6) = v.u16(7) = v.u16(4); break; case 5: v.u16(0) = v.u16(2) = v.u16(3) = v.u16(1); v.u16(4) = v.u16(6) = v.u16(7) = v.u16(5); break; case 6: v.u16(0) = v.u16(1) = v.u16(3) = v.u16(2); v.u16(4) = v.u16(5) = v.u16(7) = v.u16(6); break; case 7: v.u16(0) = v.u16(1) = v.u16(2) = v.u16(3); v.u16(4) = v.u16(5) = v.u16(6) = v.u16(7); break; case 8: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(0); break; case 9: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(1); break; case 10: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(2); break; case 11: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(3); break; case 12: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(4); break; case 13: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(5); break; case 14: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(6); break; case 15: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(7); break; } return v; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 static const __m128i shuffle[16] = { //vector _mm_set_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), //01234567 _mm_set_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), //01234567 //scalar quarter _mm_set_epi8(15,14,15,14,11,10,11,10, 7, 6, 7, 6, 3, 2, 3, 2), 
//00224466 _mm_set_epi8(13,12,13,12, 9, 8, 9, 8, 5, 4, 5, 4, 1, 0, 1, 0), //11335577 //scalar half _mm_set_epi8(15,14,15,14,15,14,15,14, 7, 6, 7, 6, 7, 6, 7, 6), //00004444 _mm_set_epi8(13,12,13,12,13,12,13,12, 5, 4, 5, 4, 5, 4, 5, 4), //11115555 _mm_set_epi8(11,10,11,10,11,10,11,10, 3, 2, 3, 2, 3, 2, 3, 2), //22226666 _mm_set_epi8(9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0), //33337777 //scalar whole _mm_set_epi8(15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14), //00000000 _mm_set_epi8(13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12), //11111111 _mm_set_epi8(11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10), //22222222 _mm_set_epi8(9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8), //33333333 _mm_set_epi8(7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6), //44444444 _mm_set_epi8(5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4), //55555555 _mm_set_epi8(3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2), //66666666 _mm_set_epi8(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0), //77777777 }; //todo: benchmark to see if testing for cases 0&1 to return value directly is faster r128 ret; ret.v128 = _mm_shuffle_epi8(v128, shuffle[index]); return ret; #endif } } auto RSP::accumulatorGet(u32 index) const -> u64 { return (u64)ACCH.u16(index) << 32 | (u64)ACCM.u16(index) << 16 | (u64)ACCL.u16(index) << 0; } auto RSP::accumulatorSet(u32 index, u64 value) -> void { ACCH.u16(index) = value >> 32; ACCM.u16(index) = value >> 16; ACCL.u16(index) = value >> 0; } auto RSP::accumulatorSaturate(u32 index, bool slice, u16 negative, u16 positive) const -> u16 { if (ACCH.s16(index) < 0) { if (ACCH.u16(index) != 0xffff) return negative; if (ACCM.s16(index) >= 0) return negative; } else { if (ACCH.u16(index) != 0x0000) return positive; if (ACCM.s16(index) < 0) return positive; } return !slice ? 
ACCL.u16(index) : ACCM.u16(index); } auto RSP::CFC2(r32& rt, u8 rd) -> void { r128 hi, lo; switch (rd & 3) { case 0x00: hi = VCOH; lo = VCOL; break; case 0x01: hi = VCCH; lo = VCCL; break; case 0x02: hi = zero; lo = VCE; break; case 0x03: hi = zero; lo = VCE; break; //unverified } if constexpr (Accuracy::RSP::SISD) { rt = 0; for (u32 n = 0; n < 8; n++) { rt |= lo.get(n) << (0 + n); rt |= hi.get(n) << (8 + n); } rt = s16(rt); } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 static const v128 reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); rt = s16(_mm_movemask_epi8(_mm_shuffle_epi8(_mm_packs_epi16(hi, lo), reverse))); #endif } } auto RSP::CTC2(cr32& rt, u8 rd) -> void { r128* hi; r128* lo; r128 null; switch (rd & 3) { case 0x00: hi = &VCOH; lo = &VCOL; break; case 0x01: hi = &VCCH; lo = &VCCL; break; case 0x02: hi = &null; lo = &VCE; break; case 0x03: hi = &null; lo = &VCE; break; //unverified } if constexpr (Accuracy::RSP::SISD) { for (u32 n = 0; n < 8; n++) { lo->set(n, rt & 1 << (0 + n)); hi->set(n, rt & 1 << (8 + n)); } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 static const v128 mask = _mm_set_epi16(0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080); lo->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_shuffle_epi8(r128{ ~rt >> 0 }, zero), mask), zero); hi->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_shuffle_epi8(r128{ ~rt >> 8 }, zero), mask), zero); #endif } } template auto RSP::LBV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm; vt.byte(e) = RSP_MEM_B(0, address); } template auto RSP::LDV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 8; auto start = e; auto end = std::min(start + 8, 16); for (u32 offset = start; offset < end; offset++) { vt.byte(offset & 15) = RSP_MEM_B(0, address++); } } template auto RSP::LFV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto index = (address & 7) - e; address &= ~7; auto start = e; auto 
end = std::min(start + 8, 16); r128 tmp; for (u32 offset = 0; offset < 4; offset++) { tmp.element(offset + 0) = RSP_MEM_B(0, address + (index + offset * 4 + 0 & 15)) << 7; tmp.element(offset + 4) = RSP_MEM_B(0, address + (index + offset * 4 + 8 & 15)) << 7; } for (u32 offset = start; offset < end; offset++) { vt.byte(offset) = tmp.byte(offset); } } template auto RSP::LHV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto index = (address & 7) - e; address &= ~7; for (u32 offset = 0; offset < 8; offset++) { vt.element(offset) = RSP_MEM_B(0, address + (index + offset * 2 & 15)) << 7; } } template auto RSP::LLV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 4; auto start = e; auto end = std::min(start + 4, 16); for (u32 offset = start; offset < end; offset++) { vt.byte(offset & 15) = RSP_MEM_B(0, address++); } } template auto RSP::LPV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 8; auto index = (address & 7) - e; address &= ~7; for (u32 offset = 0; offset < 8; offset++) { vt.element(offset) = RSP_MEM_B(0, address + (index + offset & 15)) << 8; } } template auto RSP::LQV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto start = e; auto end = std::min((u32)(16 + e - (address & 15)), (u32)16); for (u32 offset = start; offset < end; offset++) { vt.byte(offset & 15) = RSP_MEM_B(0, address++); } } template auto RSP::LRV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto index = e; auto start = 16 - ((address & 15) - index); address &= ~15; for (u32 offset = start; offset < 16; offset++) { vt.byte(offset & 15) = RSP_MEM_B(0, address++); } } template auto RSP::LSV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 2; auto start = e; auto end = std::min(start + 2, 16); for (u32 offset = start; offset < end; offset++) { vt.byte(offset & 15) = RSP_MEM_B(0, address++); } } template auto RSP::LTV(u8 vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto 
begin = address & ~7; address = begin + ((e + (address & 8)) & 15); auto vtbase = vt & ~7; auto vtoff = e >> 1; for (u32 i = 0; i < 8; i++) { vpu.r[vtbase + vtoff].byte(i * 2 + 0) = RSP_MEM_B(0, address++); if (address == begin + 16) address = begin; vpu.r[vtbase + vtoff].byte(i * 2 + 1) = RSP_MEM_B(0, address++); if (address == begin + 16) address = begin; vtoff = vtoff + 1 & 7; } } template auto RSP::LUV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 8; auto index = (address & 7) - e; address &= ~7; for (u32 offset = 0; offset < 8; offset++) { vt.element(offset) = RSP_MEM_B(0, address + (index + offset & 15)) << 7; } } template auto RSP::LWV(r128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto start = 16 - e; auto end = e + 16; for (u32 offset = start; offset < end; offset++) { vt.byte(offset & 15) = RSP_MEM_B(0, address); address += 4; } } template auto RSP::MFC2(r32& rt, cr128& vs) -> void { auto hi = vs.byte(e + 0 & 15); auto lo = vs.byte(e + 1 & 15); rt = s16(hi << 8 | lo << 0); } template auto RSP::MTC2(cr32& rt, r128& vs) -> void { vs.byte(e + 0) = rt >> 8; if (e != 15) vs.byte(e + 1) = rt >> 0; } template auto RSP::SBV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm; RSP_MEM_B(0, address) = vt.byte(e); } template auto RSP::SDV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 8; auto start = e; auto end = start + 8; for (u32 offset = start; offset < end; offset++) { RSP_MEM_B(0, address++) = vt.byte(offset & 15); } } template auto RSP::SFV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto base = address & 7; address &= ~7; switch (e) { case 0: case 15: RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(0) >> 7; RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(1) >> 7; RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(2) >> 7; RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(3) >> 7; break; case 1: RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(6) 
>> 7; RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(7) >> 7; RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(4) >> 7; RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(5) >> 7; break; case 4: RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(1) >> 7; RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(2) >> 7; RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(3) >> 7; RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(0) >> 7; break; case 5: RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(7) >> 7; RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(4) >> 7; RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(5) >> 7; RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(6) >> 7; break; case 8: RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(4) >> 7; RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(5) >> 7; RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(6) >> 7; RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(7) >> 7; break; case 11: RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(3) >> 7; RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(0) >> 7; RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(1) >> 7; RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(2) >> 7; break; case 12: RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(5) >> 7; RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(6) >> 7; RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(7) >> 7; RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(4) >> 7; break; default: RSP_MEM_B(0, address + (base + 0 & 15)) = 0; RSP_MEM_B(0, address + (base + 4 & 15)) = 0; RSP_MEM_B(0, address + (base + 8 & 15)) = 0; RSP_MEM_B(0, address + (base + 12 & 15)) = 0; break; } } template auto RSP::SHV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto index = address & 7; address &= ~7; for (u32 offset = 0; offset < 8; offset++) { auto byte = e + offset * 2; auto value = vt.byte(byte + 0 & 15) << 1 | vt.byte(byte + 1 & 15) >> 
7; RSP_MEM_B(0, address + (index + offset * 2 & 15)) = value; } } template auto RSP::SLV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 4; auto start = e; auto end = start + 4; for (u32 offset = start; offset < end; offset++) { RSP_MEM_B(0, address++) = vt.byte(offset & 15); } } template auto RSP::SPV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 8; auto start = e; auto end = start + 8; for (u32 offset = start; offset < end; offset++) { if ((offset & 15) < 8) { RSP_MEM_B(0, address++) = vt.byte((offset & 7) << 1); } else { RSP_MEM_B(0, address++) = vt.element(offset & 7) >> 7; } } } template auto RSP::SQV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto start = e; auto end = start + (16 - (address & 15)); for (u32 offset = start; offset < end; offset++) { RSP_MEM_B(0, address++) = vt.byte(offset & 15); } } template auto RSP::SRV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto start = e; auto end = start + (address & 15); auto base = 16 - (address & 15); address &= ~15; for (u32 offset = start; offset < end; offset++) { RSP_MEM_B(0, address++) = vt.byte(offset + base & 15); } } template auto RSP::SSV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 2; auto start = e; auto end = start + 2; for (u32 offset = start; offset < end; offset++) { RSP_MEM_B(0, address++) = vt.byte(offset & 15); } } template auto RSP::STV(u8 vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto start = vt & ~7; auto end = start + 8; auto element = 16 - (e & ~1); auto base = (address & 7) - (e & ~1); address &= ~7; for (u32 offset = start; offset < end; offset++) { RSP_MEM_B(0, address + (base++ & 15)) = vpu.r[offset].byte(element++ & 15); RSP_MEM_B(0, address + (base++ & 15)) = vpu.r[offset].byte(element++ & 15); } } template auto RSP::SUV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 8; auto start = e; auto end = start + 8; for (u32 offset = start; offset < 
end; offset++) { if ((offset & 15) < 8) { RSP_MEM_B(0, address++) = vt.element(offset & 7) >> 7; } else { RSP_MEM_B(0, address++) = vt.byte((offset & 7) << 1); } } } template auto RSP::SWV(cr128& vt, cr32& rs, s8 imm) -> void { auto address = rs + imm * 16; auto start = e; auto end = start + 16; auto base = address & 7; address &= ~7; for (u32 offset = start; offset < end; offset++) { RSP_MEM_B(0, address + (base++ & 15)) = vt.byte(offset & 15); } } template auto RSP::VABS(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { r128 vte = vt(e); for (u32 n = 0; n < 8; n++) { if (vs.s16(n) < 0) { if (vte.s16(n) == -32768) { ACCL.s16(n) = -32768; vd.s16(n) = 32767; } else { ACCL.s16(n) = -vte.s16(n); vd.s16(n) = -vte.s16(n); } } else if (vs.s16(n) > 0) { ACCL.s16(n) = +vte.s16(n); vd.s16(n) = +vte.s16(n); } else { ACCL.s16(n) = 0; vd.s16(n) = 0; } } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vs0, slt; vs0 = _mm_cmpeq_epi16(vs, zero); slt = _mm_srai_epi16(vs, 15); vd = _mm_andnot_si128(vs0, vt(e)); vd = _mm_xor_si128(vd, slt); ACCL = _mm_sub_epi16(vd, slt); vd = _mm_subs_epi16(vd, slt); #endif } } template auto RSP::VADD(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { s32 result = vs.s16(n) + vte.s16(n) + VCOL.get(n); ACCL.s16(n) = result; vd.s16(n) = sclamp<16>(result); } VCOL = zero; VCOH = zero; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), sum, min, max; sum = _mm_add_epi16(vs, vte); ACCL = _mm_sub_epi16(sum, VCOL); min = _mm_min_epi16(vs, vte); max = _mm_max_epi16(vs, vte); min = _mm_subs_epi16(min, VCOL); vd = _mm_adds_epi16(min, max); VCOL = zero; VCOH = zero; #endif } } template auto RSP::VADDC(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { u32 result = vs.u16(n) + vte.u16(n); ACCL.u16(n) = result; 
VCOL.set(n, result >> 16); } VCOH = zero; vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), sum; sum = _mm_adds_epu16(vs, vte); ACCL = _mm_add_epi16(vs, vte); VCOL = _mm_cmpeq_epi16(sum, ACCL); VCOL = _mm_cmpeq_epi16(VCOL, zero); VCOH = zero; vd = ACCL; #endif } } template auto RSP::VAND(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { r128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = vs.u16(n) & vte.u16(n); } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 ACCL = _mm_and_si128(vs, vt(e)); vd = ACCL; #endif } } template auto RSP::VCH(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { if ((vs.s16(n) ^ vte.s16(n)) < 0) { s16 result = vs.s16(n) + vte.s16(n); ACCL.s16(n) = (result <= 0 ? -vte.s16(n) : vs.s16(n)); VCCL.set(n, result <= 0); VCCH.set(n, vte.s16(n) < 0); VCOL.set(n, 1); VCOH.set(n, result != 0 && vs.u16(n) != (vte.u16(n) ^ 0xffff)); VCE.set(n, result == -1); } else { s16 result = vs.s16(n) - vte.s16(n); ACCL.s16(n) = (result >= 0 ? 
vte.s16(n) : vs.s16(n)); VCCL.set(n, vte.s16(n) < 0); VCCH.set(n, result >= 0); VCOL.set(n, 0); VCOH.set(n, result != 0 && vs.u16(n) != (vte.u16(n) ^ 0xffff)); VCE.set(n, 0); } } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), nvt, diff, diff0, vtn, dlez, dgez, mask; VCOL = _mm_xor_si128(vs, vte); VCOL = _mm_cmplt_epi16(VCOL, zero); nvt = _mm_xor_si128(vte, VCOL); nvt = _mm_sub_epi16(nvt, VCOL); diff = _mm_sub_epi16(vs, nvt); diff0 = _mm_cmpeq_epi16(diff, zero); vtn = _mm_cmplt_epi16(vte, zero); dlez = _mm_cmpgt_epi16(diff, zero); dgez = _mm_or_si128(dlez, diff0); dlez = _mm_cmpeq_epi16(zero, dlez); VCCH = _mm_blendv_epi8(dgez, vtn, VCOL); VCCL = _mm_blendv_epi8(vtn, dlez, VCOL); VCE = _mm_cmpeq_epi16(diff, VCOL); VCE = _mm_and_si128(VCE, VCOL); VCOH = _mm_or_si128(diff0, VCE); VCOH = _mm_cmpeq_epi16(VCOH, zero); mask = _mm_blendv_epi8(VCCH, VCCL, VCOL); ACCL = _mm_blendv_epi8(vs, nvt, mask); vd = ACCL; #endif } } template auto RSP::VCL(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { if (VCOL.get(n)) { if (VCOH.get(n)) { ACCL.u16(n) = VCCL.get(n) ? -vte.u16(n) : vs.u16(n); } else { u16 sum = vs.u16(n) + vte.u16(n); bool carry = (vs.u16(n) + vte.u16(n)) != sum; if (VCE.get(n)) { ACCL.u16(n) = VCCL.set(n, (!sum || !carry)) ? -vte.u16(n) : vs.u16(n); } else { ACCL.u16(n) = VCCL.set(n, (!sum && !carry)) ? -vte.u16(n) : vs.u16(n); } } } else { if (VCOH.get(n)) { ACCL.u16(n) = VCCH.get(n) ? vte.u16(n) : vs.u16(n); } else { ACCL.u16(n) = VCCH.set(n, (s32)vs.u16(n) - (s32)vte.u16(n) >= 0) ? 
vte.u16(n) : vs.u16(n); } } } VCOL = zero; VCOH = zero; VCE = zero; vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), nvt, diff, ncarry, nvce, diff0, lec1, lec2, leeq, geeq, le, ge, mask; nvt = _mm_xor_si128(vte, VCOL); nvt = _mm_sub_epi16(nvt, VCOL); diff = _mm_sub_epi16(vs, nvt); ncarry = _mm_adds_epu16(vs, vte); ncarry = _mm_cmpeq_epi16(diff, ncarry); nvce = _mm_cmpeq_epi16(VCE, zero); diff0 = _mm_cmpeq_epi16(diff, zero); lec1 = _mm_and_si128(diff0, ncarry); lec1 = _mm_and_si128(nvce, lec1); lec2 = _mm_or_si128(diff0, ncarry); lec2 = _mm_and_si128(VCE, lec2); leeq = _mm_or_si128(lec1, lec2); geeq = _mm_subs_epu16(vte, vs); geeq = _mm_cmpeq_epi16(geeq, zero); le = _mm_andnot_si128(VCOH, VCOL); le = _mm_blendv_epi8(VCCL, leeq, le); ge = _mm_or_si128(VCOL, VCOH); ge = _mm_blendv_epi8(geeq, VCCH, ge); mask = _mm_blendv_epi8(ge, le, VCOL); ACCL = _mm_blendv_epi8(vs, nvt, mask); VCCH = ge; VCCL = le; VCOH = zero; VCOL = zero; VCE = zero; vd = ACCL; #endif } } template auto RSP::VCR(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { if ((vs.s16(n) ^ vte.s16(n)) < 0) { VCCH.set(n, vte.s16(n) < 0); ACCL.u16(n) = VCCL.set(n, vs.s16(n) + vte.s16(n) + 1 <= 0) ? ~vte.u16(n) : vs.u16(n); } else { VCCL.set(n, vte.s16(n) < 0); ACCL.u16(n) = VCCH.set(n, vs.s16(n) - vte.s16(n) >= 0) ? 
vte.u16(n) : vs.u16(n); } } VCOL = zero; VCOH = zero; VCE = zero; vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), sign, dlez, dgez, nvt, mask; sign = _mm_xor_si128(vs, vte); sign = _mm_srai_epi16(sign, 15); dlez = _mm_and_si128(vs, sign); dlez = _mm_add_epi16(dlez, vte); VCCL = _mm_srai_epi16(dlez, 15); dgez = _mm_or_si128(vs, sign); dgez = _mm_min_epi16(dgez, vte); VCCH = _mm_cmpeq_epi16(dgez, vte); nvt = _mm_xor_si128(vte, sign); mask = _mm_blendv_epi8(VCCH, VCCL, sign); ACCL = _mm_blendv_epi8(vs, nvt, mask); vd = ACCL; VCOL = zero; VCOH = zero; VCE = zero; #endif } } template auto RSP::VEQ(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = VCCL.set(n, !VCOH.get(n) && vs.u16(n) == vte.u16(n)) ? vs.u16(n) : vte.u16(n); } VCCH = zero; //unverified VCOL = zero; VCOH = zero; vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), eq; eq = _mm_cmpeq_epi16(vs, vte); VCCL = _mm_andnot_si128(VCOH, eq); ACCL = _mm_blendv_epi8(vte, vs, VCCL); VCCH = zero; //unverified VCOH = zero; VCOL = zero; vd = ACCL; #endif } } template auto RSP::VGE(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = VCCL.set(n, vs.s16(n) > vte.s16(n) || (vs.s16(n) == vte.s16(n) && (!VCOL.get(n) || !VCOH.get(n)))) ? 
vs.u16(n) : vte.u16(n); } VCCH = zero; //unverified VCOL = zero; VCOH = zero; vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), eq, gt, es; eq = _mm_cmpeq_epi16(vs, vte); gt = _mm_cmpgt_epi16(vs, vte); es = _mm_and_si128(VCOH, VCOL); eq = _mm_andnot_si128(es, eq); VCCL = _mm_or_si128(gt, eq); ACCL = _mm_blendv_epi8(vte, vs, VCCL); VCCH = zero; VCOH = zero; VCOL = zero; vd = ACCL; #endif } } template auto RSP::VLT(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = VCCL.set(n, vs.s16(n) < vte.s16(n) || (vs.s16(n) == vte.s16(n) && VCOL.get(n) && VCOH.get(n))) ? vs.u16(n) : vte.u16(n); } VCCH = zero; VCOL = zero; VCOH = zero; vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), eq, lt; eq = _mm_cmpeq_epi16(vs, vte); lt = _mm_cmplt_epi16(vs, vte); eq = _mm_and_si128(VCOH, eq); eq = _mm_and_si128(VCOL, eq); VCCL = _mm_or_si128(lt, eq); ACCL = _mm_blendv_epi8(vte, vs, VCCL); VCCH = zero; VCOH = zero; VCOL = zero; vd = ACCL; #endif } } template auto RSP::VMACF(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, accumulatorGet(n) + (s64)vs.s16(n) * (s64)vte.s16(n) * 2); if constexpr (U == 0) { vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff); } if constexpr (U == 1) { vd.u16(n) = ACCH.s16(n) < 0 ? 0x0000 : ACCH.s16(n) || ACCM.s16(n) < 0 ? 
0xffff : ACCM.u16(n); } } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), lo, md, hi, carry, omask; lo = _mm_mullo_epi16(vs, vte); hi = _mm_mulhi_epi16(vs, vte); md = _mm_slli_epi16(hi, 1); carry = _mm_srli_epi16(lo, 15); hi = _mm_srai_epi16(hi, 15); md = _mm_or_si128(md, carry); lo = _mm_slli_epi16(lo, 1); omask = _mm_adds_epu16(ACCL, lo); ACCL = _mm_add_epi16(ACCL, lo); omask = _mm_cmpeq_epi16(ACCL, omask); omask = _mm_cmpeq_epi16(omask, zero); md = _mm_sub_epi16(md, omask); carry = _mm_cmpeq_epi16(md, zero); carry = _mm_and_si128(carry, omask); hi = _mm_sub_epi16(hi, carry); omask = _mm_adds_epu16(ACCM, md); ACCM = _mm_add_epi16(ACCM, md); omask = _mm_cmpeq_epi16(ACCM, omask); omask = _mm_cmpeq_epi16(omask, zero); ACCH = _mm_add_epi16(ACCH, hi); ACCH = _mm_sub_epi16(ACCH, omask); if constexpr (!U) { lo = _mm_unpacklo_epi16(ACCM, ACCH); hi = _mm_unpackhi_epi16(ACCM, ACCH); vd = _mm_packs_epi32(lo, hi); } else { r128 mmask, hmask; mmask = _mm_srai_epi16(ACCM, 15); hmask = _mm_srai_epi16(ACCH, 15); md = _mm_or_si128(mmask, ACCM); omask = _mm_cmpgt_epi16(ACCH, zero); md = _mm_andnot_si128(hmask, md); vd = _mm_or_si128(omask, md); } #endif } } auto RSP::VMACQ(r128& vd) -> void { for (u32 n = 0; n < 8; n++) { s32 product = ACCH.element(n) << 16 | ACCM.element(n) << 0; if (product < 0 && !(product & 1 << 5)) product += 32; else if (product >= 32 && !(product & 1 << 5)) product -= 32; ACCH.element(n) = product >> 16; ACCM.element(n) = product >> 0; vd.element(n) = sclamp<16>(product >> 1) & ~15; } } template auto RSP::VMADH(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { s32 result = (accumulatorGet(n) >> 16) + vs.s16(n) * vte.s16(n); ACCH.u16(n) = result >> 16; ACCM.u16(n) = result >> 0; vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff); } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), lo, hi, omask; lo 
= _mm_mullo_epi16(vs, vte); hi = _mm_mulhi_epi16(vs, vte); omask = _mm_adds_epu16(ACCM, lo); ACCM = _mm_add_epi16(ACCM, lo); omask = _mm_cmpeq_epi16(ACCM, omask); omask = _mm_cmpeq_epi16(omask, zero); hi = _mm_sub_epi16(hi, omask); ACCH = _mm_add_epi16(ACCH, hi); lo = _mm_unpacklo_epi16(ACCM, ACCH); hi = _mm_unpackhi_epi16(ACCM, ACCH); vd = _mm_packs_epi32(lo, hi); #endif } } template auto RSP::VMADL(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, accumulatorGet(n) + (u32(vs.u16(n) * vte.u16(n)) >> 16)); vd.u16(n) = accumulatorSaturate(n, 0, 0x0000, 0xffff); } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), hi, omask, nhi, nmd, shi, smd, cmask, cval; hi = _mm_mulhi_epu16(vs, vte); omask = _mm_adds_epu16(ACCL, hi); ACCL = _mm_add_epi16(ACCL, hi); omask = _mm_cmpeq_epi16(ACCL, omask); omask = _mm_cmpeq_epi16(omask, zero); hi = _mm_sub_epi16(zero, omask); omask = _mm_adds_epu16(ACCM, hi); ACCM = _mm_add_epi16(ACCM, hi); omask = _mm_cmpeq_epi16(ACCM, omask); omask = _mm_cmpeq_epi16(omask, zero); ACCH = _mm_sub_epi16(ACCH, omask); nhi = _mm_srai_epi16(ACCH, 15); nmd = _mm_srai_epi16(ACCM, 15); shi = _mm_cmpeq_epi16(nhi, ACCH); smd = _mm_cmpeq_epi16(nhi, nmd); cmask = _mm_and_si128(smd, shi); cval = _mm_cmpeq_epi16(nhi, zero); vd = _mm_blendv_epi8(cval, ACCL, cmask); #endif } } template auto RSP::VMADM(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, accumulatorGet(n) + vs.s16(n) * vte.u16(n)); vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff); } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), lo, hi, sign, vta, omask; lo = _mm_mullo_epi16(vs, vte); hi = _mm_mulhi_epu16(vs, vte); sign = _mm_srai_epi16(vs, 15); vta = _mm_and_si128(vte, sign); hi = _mm_sub_epi16(hi, vta); omask = 
_mm_adds_epu16(ACCL, lo); ACCL = _mm_add_epi16(ACCL, lo); omask = _mm_cmpeq_epi16(ACCL, omask); omask = _mm_cmpeq_epi16(omask, zero); hi = _mm_sub_epi16(hi, omask); omask = _mm_adds_epu16(ACCM, hi); ACCM = _mm_add_epi16(ACCM, hi); omask = _mm_cmpeq_epi16(ACCM, omask); omask = _mm_cmpeq_epi16(omask, zero); hi = _mm_srai_epi16(hi, 15); ACCH = _mm_add_epi16(ACCH, hi); ACCH = _mm_sub_epi16(ACCH, omask); lo = _mm_unpacklo_epi16(ACCM, ACCH); hi = _mm_unpackhi_epi16(ACCM, ACCH); vd = _mm_packs_epi32(lo, hi); #endif } } template auto RSP::VMADN(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, accumulatorGet(n) + s64(vs.u16(n) * vte.s16(n))); vd.u16(n) = accumulatorSaturate(n, 0, 0x0000, 0xffff); } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), lo, hi, sign, vsa, omask, nhi, nmd, shi, smd, cmask, cval; lo = _mm_mullo_epi16(vs, vte); hi = _mm_mulhi_epu16(vs, vte); sign = _mm_srai_epi16(vte, 15); vsa = _mm_and_si128(vs, sign); hi = _mm_sub_epi16(hi, vsa); omask = _mm_adds_epu16(ACCL, lo); ACCL = _mm_add_epi16(ACCL, lo); omask = _mm_cmpeq_epi16(ACCL, omask); omask = _mm_cmpeq_epi16(omask, zero); hi = _mm_sub_epi16(hi, omask); omask = _mm_adds_epu16(ACCM, hi); ACCM = _mm_add_epi16(ACCM, hi); omask = _mm_cmpeq_epi16(ACCM, omask); omask = _mm_cmpeq_epi16(omask, zero); hi = _mm_srai_epi16(hi, 15); ACCH = _mm_add_epi16(ACCH, hi); ACCH = _mm_sub_epi16(ACCH, omask); nhi = _mm_srai_epi16(ACCH, 15); nmd = _mm_srai_epi16(ACCM, 15); shi = _mm_cmpeq_epi16(nhi, ACCH); smd = _mm_cmpeq_epi16(nhi, nmd); cmask = _mm_and_si128(smd, shi); cval = _mm_cmpeq_epi16(nhi, zero); vd = _mm_blendv_epi8(cval, ACCL, cmask); #endif } } template auto RSP::VMOV(r128& vd, u8 de, cr128& vt) -> void { cr128 vte = vt(e); vd.u16(de) = vte.u16(de); ACCL = vte; } template auto RSP::VMRG(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = 
vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = VCCL.get(n) ? vs.u16(n) : vte.u16(n); } VCOH = zero; VCOL = zero; vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 ACCL = _mm_blendv_epi8(vt(e), vs, VCCL); VCOH = zero; VCOL = zero; vd = ACCL; #endif } } template auto RSP::VMUDH(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, s64(vs.s16(n) * vte.s16(n)) << 16); vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff); } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), lo, hi; ACCL = zero; ACCM = _mm_mullo_epi16(vs, vte); ACCH = _mm_mulhi_epi16(vs, vte); lo = _mm_unpacklo_epi16(ACCM, ACCH); hi = _mm_unpackhi_epi16(ACCM, ACCH); vd = _mm_packs_epi32(lo, hi); #endif } } template auto RSP::VMUDL(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, u16(vs.u16(n) * vte.u16(n) >> 16)); } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 ACCL = _mm_mulhi_epu16(vs, vt(e)); ACCM = zero; ACCH = zero; vd = ACCL; #endif } } template auto RSP::VMUDM(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, s32(vs.s16(n) * vte.u16(n))); } vd = ACCM; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), sign, vta; ACCL = _mm_mullo_epi16(vs, vte); ACCM = _mm_mulhi_epu16(vs, vte); sign = _mm_srai_epi16(vs, 15); vta = _mm_and_si128(vte, sign); ACCM = _mm_sub_epi16(ACCM, vta); ACCH = _mm_srai_epi16(ACCM, 15); vd = ACCM; #endif } } template auto RSP::VMUDN(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, s32(vs.u16(n) * vte.s16(n))); } vd = ACCL; } if constexpr 
(Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), sign, vsa; ACCL = _mm_mullo_epi16(vs, vte); ACCM = _mm_mulhi_epu16(vs, vte); sign = _mm_srai_epi16(vte, 15); vsa = _mm_and_si128(vs, sign); ACCM = _mm_sub_epi16(ACCM, vsa); ACCH = _mm_srai_epi16(ACCM, 15); vd = ACCL; #endif } } template auto RSP::VMULF(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { accumulatorSet(n, (s64)vs.s16(n) * (s64)vte.s16(n) * 2 + 0x8000); if constexpr (U == 0) { vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff); } if constexpr (U == 1) { vd.u16(n) = ACCH.s16(n) < 0 ? 0x0000 : (ACCH.s16(n) ^ ACCM.s16(n)) < 0 ? 0xffff : ACCM.u16(n); } } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), lo, hi, round, sign1, sign2, neq, eq, neg; lo = _mm_mullo_epi16(vs, vte); round = _mm_cmpeq_epi16(zero, zero); sign1 = _mm_srli_epi16(lo, 15); lo = _mm_add_epi16(lo, lo); round = _mm_slli_epi16(round, 15); hi = _mm_mulhi_epi16(vs, vte); sign2 = _mm_srli_epi16(lo, 15); ACCL = _mm_add_epi16(round, lo); sign1 = _mm_add_epi16(sign1, sign2); hi = _mm_slli_epi16(hi, 1); neq = _mm_cmpeq_epi16(vs, vte); ACCM = _mm_add_epi16(hi, sign1); neg = _mm_srai_epi16(ACCM, 15); if constexpr (!U) { eq = _mm_and_si128(neq, neg); ACCH = _mm_andnot_si128(neq, neg); vd = _mm_add_epi16(ACCM, eq); } else { ACCH = _mm_andnot_si128(neq, neg); hi = _mm_or_si128(ACCM, neg); vd = _mm_andnot_si128(ACCH, hi); } #endif } } template auto RSP::VMULQ(r128& vd, cr128& vs, cr128& vt) -> void { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { s32 product = (s16)vs.element(n) * (s16)vte.element(n); if (product < 0) product += 31; //round ACCH.element(n) = product >> 16; ACCM.element(n) = product >> 0; ACCL.element(n) = 0; vd.element(n) = sclamp<16>(product >> 1) & ~15; } } template auto RSP::VNAND(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n 
= 0; n < 8; n++) { ACCL.u16(n) = ~(vs.u16(n) & vte.u16(n)); } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 ACCL = _mm_and_si128(vs, vt(e)); ACCL = _mm_xor_si128(ACCL, invert); vd = ACCL; #endif } } template auto RSP::VNE(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = VCCL.set(n, vs.u16(n) != vte.u16(n) || VCOH.get(n)) ? vs.u16(n) : vte.u16(n); } VCCH = zero; //unverified VCOL = zero; VCOH = zero; vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), eq, ne; eq = _mm_cmpeq_epi16(vs, vte); ne = _mm_cmpeq_epi16(eq, zero); VCCL = _mm_and_si128(VCOH, eq); VCCL = _mm_or_si128(VCCL, ne); ACCL = _mm_blendv_epi8(vte, vs, VCCL); VCCH = zero; VCOH = zero; VCOL = zero; vd = ACCL; #endif } } auto RSP::VNOP() -> void { } template auto RSP::VNOR(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = ~(vs.u16(n) | vte.u16(n)); } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 ACCL = _mm_or_si128(vs, vt(e)); ACCL = _mm_xor_si128(ACCL, invert); vd = ACCL; #endif } } template auto RSP::VNXOR(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = ~(vs.u16(n) ^ vte.u16(n)); } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 ACCL = _mm_xor_si128(vs, vt(e)); ACCL = _mm_xor_si128(ACCL, invert); vd = ACCL; #endif } } template auto RSP::VOR(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = vs.u16(n) | vte.u16(n); } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 ACCL = _mm_or_si128(vs, vt(e)); vd = ACCL; #endif } } template auto RSP::VRCP(r128& vd, u8 
de, cr128& vt) -> void { s32 result = 0; s32 input = L && DIVDP ? DIVIN << 16 | vt.element(e & 7) : s16(vt.element(e & 7)); s32 mask = input >> 31; s32 data = input ^ mask; if (input > -32768) data -= mask; if (data == 0) { result = 0x7fff'ffff; } else if (input == -32768) { result = 0xffff'0000; } else { u32 shift = __builtin_clz(data); u32 index = (u64(data) << shift & 0x7fc0'0000) >> 22; result = rspReciprocals[index]; result = (0x10000 | result) << 14; result = result >> 31 - shift ^ mask; } DIVDP = 0; DIVOUT = result >> 16; ACCL = vt(e); vd.element(de) = result; } template auto RSP::VRCPH(r128& vd, u8 de, cr128& vt) -> void { ACCL = vt(e); DIVDP = 1; DIVIN = vt.element(e & 7); vd.element(de) = DIVOUT; } template auto RSP::VRND(r128& vd, u8 vs, cr128& vt) -> void { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { s32 product = (s16)vte.element(n); if (vs & 1) product <<= 16; s64 acc = 0; acc |= ACCH.element(n); acc <<= 16; acc |= ACCM.element(n); acc <<= 16; acc |= ACCL.element(n); acc <<= 16; acc >>= 16; if (D == 0 && acc < 0) acc = sclip<48>(acc + product); if (D == 1 && acc >= 0) acc = sclip<48>(acc + product); ACCH.element(n) = acc >> 32; ACCM.element(n) = acc >> 16; ACCL.element(n) = acc >> 0; vd.element(n) = sclamp<16>(acc >> 16); } } template auto RSP::VRSQ(r128& vd, u8 de, cr128& vt) -> void { s32 result = 0; s32 input = L && DIVDP ? 
DIVIN << 16 | vt.element(e & 7) : s16(vt.element(e & 7)); s32 mask = input >> 31; s32 data = input ^ mask; if (input > -32768) data -= mask; if (data == 0) { result = 0x7fff'ffff; } else if (input == -32768) { result = 0xffff'0000; } else { u32 shift = __builtin_clz(data); u32 index = (u64(data) << shift & 0x7fc0'0000) >> 22; result = rspInverseSquareRoots[index & 0x1fe | shift & 1]; result = (0x10000 | result) << 14; result = result >> (31 - shift >> 1) ^ mask; } DIVDP = 0; DIVOUT = result >> 16; ACCL = vt(e); vd.element(de) = result; } template auto RSP::VRSQH(r128& vd, u8 de, cr128& vt) -> void { ACCL = vt(e); DIVDP = 1; DIVIN = vt.element(e & 7); vd.element(de) = DIVOUT; } template auto RSP::VSAR(r128& vd, cr128& vs) -> void { switch (e) { case 0x8: vd = ACCH; break; case 0x9: vd = ACCM; break; case 0xa: vd = ACCL; break; default: vd = zero; break; } } template auto RSP::VSUB(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { s32 result = vs.s16(n) - vte.s16(n) - VCOL.get(n); ACCL.s16(n) = result; vd.s16(n) = sclamp<16>(result); } VCOL = zero; VCOH = zero; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), udiff, sdiff, ov; udiff = _mm_sub_epi16(vte, VCOL); sdiff = _mm_subs_epi16(vte, VCOL); ACCL = _mm_sub_epi16(vs, udiff); ov = _mm_cmpgt_epi16(sdiff, udiff); vd = _mm_subs_epi16(vs, sdiff); vd = _mm_adds_epi16(vd, ov); VCOL = zero; VCOH = zero; #endif } } template auto RSP::VSUBC(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { u32 result = vs.u16(n) - vte.u16(n); ACCL.u16(n) = result; VCOL.set(n, result >> 16); VCOH.set(n, result != 0); } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), equal, udiff, diff0; udiff = _mm_subs_epu16(vs, vte); equal = _mm_cmpeq_epi16(vs, vte); diff0 = _mm_cmpeq_epi16(udiff, zero); VCOH = 
_mm_cmpeq_epi16(equal, zero); VCOL = _mm_andnot_si128(equal, diff0); ACCL = _mm_sub_epi16(vs, vte); vd = ACCL; #endif } } template auto RSP::VXOR(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { ACCL.u16(n) = vs.u16(n) ^ vte.u16(n); } vd = ACCL; } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 ACCL = _mm_xor_si128(vs, vt(e)); vd = ACCL; #endif } } template auto RSP::VZERO(r128& vd, cr128& vs, cr128& vt) -> void { if constexpr (Accuracy::RSP::SISD) { cr128 vte = vt(e); for (u32 n = 0; n < 8; n++) { s32 result = vs.s16(n) + vte.s16(n); ACCL.s16(n) = result; vd.s16(n) = 0; } } if constexpr (Accuracy::RSP::SIMD) { #if ARCHITECTURE_SUPPORTS_SSE4_1 r128 vte = vt(e), sum, min, max; ACCL = _mm_add_epi16(vs, vte); vd = _mm_xor_si128(vd, vd); #endif } } #undef ACCH #undef ACCM #undef ACCL #undef VCOH #undef VCOL #undef VCCH #undef VCCL #undef VCE #undef DIVIN #undef DIVOUT #undef DIVDP