dolphin/Source/Core/VideoCommon/VertexLoaderARM64.cpp
Skyler Saleh 4ecb3084b7 Apple M1 Support for MacOS
This commit adds support for compiling Dolphin for ARM on MacOS so that it can
run natively on the M1 processors without running through Rosseta2 emulation
providing a 30-50% performance speedup and less hitches from Rosseta2.

It consists of several key changes:

- Adding support for W^X allocation(MAP_JIT) for the ARM JIT
- Adding the machine context and config info to identify the M1 processor
- Additions to the build system and docs to support building universal binaries
- Adding code signing entitlements to access the MAP_JIT functionality
- Updating the MoltenVK libvulkan.dylib to a newer version with M1 support
2021-05-22 15:25:17 -07:00

615 lines
21 KiB
C++

// Copyright 2015 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include "VideoCommon/VertexLoaderARM64.h"
#include <array>
#include "Common/CommonTypes.h"
#include "VideoCommon/DataReader.h"
#include "VideoCommon/VertexLoaderManager.h"
using namespace Arm64Gen;
constexpr ARM64Reg src_reg = ARM64Reg::X0;
constexpr ARM64Reg dst_reg = ARM64Reg::X1;
constexpr ARM64Reg count_reg = ARM64Reg::W2;
constexpr ARM64Reg skipped_reg = ARM64Reg::W17;
constexpr ARM64Reg scratch1_reg = ARM64Reg::W16;
constexpr ARM64Reg scratch2_reg = ARM64Reg::W15;
constexpr ARM64Reg scratch3_reg = ARM64Reg::W14;
constexpr ARM64Reg saved_count = ARM64Reg::W12;
constexpr ARM64Reg stride_reg = ARM64Reg::X11;
constexpr ARM64Reg arraybase_reg = ARM64Reg::X10;
constexpr ARM64Reg scale_reg = ARM64Reg::X9;
static constexpr int GetLoadSize(int load_bytes)
{
if (load_bytes == 1)
return 1;
else if (load_bytes <= 2)
return 2;
else if (load_bytes <= 4)
return 4;
else if (load_bytes <= 8)
return 8;
else
return 16;
}
alignas(16) static const float scale_factors[] = {
1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3),
1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 6), 1.0 / (1ULL << 7),
1.0 / (1ULL << 8), 1.0 / (1ULL << 9), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11),
1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15),
1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19),
1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23),
1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27),
1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31),
};
VertexLoaderARM64::VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_att)
: VertexLoaderBase(vtx_desc, vtx_att), m_float_emit(this)
{
AllocCodeSpace(4096);
Common::JITPageWriteEnableExecuteDisable();
ClearCodeSpace();
GenerateVertexLoader();
Common::JITPageWriteDisableExecuteEnable();
WriteProtect();
}
void VertexLoaderARM64::GetVertexAddr(int array, VertexComponentFormat attribute, ARM64Reg reg)
{
if (IsIndexed(attribute))
{
if (attribute == VertexComponentFormat::Index8)
{
if (m_src_ofs < 4096)
{
LDRB(IndexType::Unsigned, scratch1_reg, src_reg, m_src_ofs);
}
else
{
ADD(reg, src_reg, m_src_ofs);
LDRB(IndexType::Unsigned, scratch1_reg, reg, 0);
}
m_src_ofs += 1;
}
else
{
if (m_src_ofs < 256)
{
LDURH(scratch1_reg, src_reg, m_src_ofs);
}
else if (m_src_ofs <= 8190 && !(m_src_ofs & 1))
{
LDRH(IndexType::Unsigned, scratch1_reg, src_reg, m_src_ofs);
}
else
{
ADD(reg, src_reg, m_src_ofs);
LDRH(IndexType::Unsigned, scratch1_reg, reg, 0);
}
m_src_ofs += 2;
REV16(scratch1_reg, scratch1_reg);
}
if (array == ARRAY_POSITION)
{
EOR(scratch2_reg, scratch1_reg, 0,
attribute == VertexComponentFormat::Index8 ? 7 : 15); // 0xFF : 0xFFFF
m_skip_vertex = CBZ(scratch2_reg);
}
LDR(IndexType::Unsigned, scratch2_reg, stride_reg, array * 4);
MUL(scratch1_reg, scratch1_reg, scratch2_reg);
LDR(IndexType::Unsigned, EncodeRegTo64(scratch2_reg), arraybase_reg, array * 8);
ADD(EncodeRegTo64(reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg));
}
else
ADD(reg, src_reg, m_src_ofs);
}
s32 VertexLoaderARM64::GetAddressImm(int array, VertexComponentFormat attribute,
Arm64Gen::ARM64Reg reg, u32 align)
{
if (IsIndexed(attribute) || (m_src_ofs > 255 && (m_src_ofs & (align - 1))))
GetVertexAddr(array, attribute, reg);
else
return m_src_ofs;
return -1;
}
int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentFormat format,
int count_in, int count_out, bool dequantize, u8 scaling_exponent,
AttributeFormat* native_format, s32 offset)
{
ARM64Reg coords = count_in == 3 ? ARM64Reg::Q31 : ARM64Reg::D31;
ARM64Reg scale = count_in == 3 ? ARM64Reg::Q30 : ARM64Reg::D30;
int elem_size = GetElementSize(format);
int load_bytes = elem_size * count_in;
int load_size = GetLoadSize(load_bytes);
load_size <<= 3;
elem_size <<= 3;
if (offset == -1)
{
if (count_in == 1)
m_float_emit.LDR(elem_size, IndexType::Unsigned, coords, EncodeRegTo64(scratch1_reg), 0);
else
m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg));
}
else if (offset & (load_size - 1)) // Not aligned - unscaled
{
m_float_emit.LDUR(load_size, coords, src_reg, offset);
}
else
{
m_float_emit.LDR(load_size, IndexType::Unsigned, coords, src_reg, offset);
}
if (format != ComponentFormat::Float)
{
// Extend and convert to float
switch (format)
{
case ComponentFormat::UByte:
m_float_emit.UXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
break;
case ComponentFormat::Byte:
m_float_emit.SXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
break;
case ComponentFormat::UShort:
m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
break;
case ComponentFormat::Short:
m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
break;
}
m_float_emit.SCVTF(32, coords, coords);
if (dequantize && scaling_exponent)
{
m_float_emit.LDR(32, IndexType::Unsigned, scale, scale_reg, scaling_exponent * 4);
m_float_emit.FMUL(32, coords, coords, scale, 0);
}
}
else
{
m_float_emit.REV32(8, coords, coords);
}
const u32 write_size = count_out == 3 ? 128 : count_out * 32;
const u32 mask = count_out == 3 ? 0xF : count_out == 2 ? 0x7 : 0x3;
if (m_dst_ofs < 256)
{
m_float_emit.STUR(write_size, coords, dst_reg, m_dst_ofs);
}
else if (!(m_dst_ofs & mask))
{
m_float_emit.STR(write_size, IndexType::Unsigned, coords, dst_reg, m_dst_ofs);
}
else
{
ADD(EncodeRegTo64(scratch2_reg), dst_reg, m_dst_ofs);
m_float_emit.ST1(32, 1, coords, EncodeRegTo64(scratch2_reg));
}
// Z-Freeze
if (native_format == &m_native_vtx_decl.position)
{
CMP(count_reg, 3);
FixupBranch dont_store = B(CC_GT);
MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::position_cache);
ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg), EncodeRegTo64(count_reg),
ArithOption(EncodeRegTo64(count_reg), ShiftType::LSL, 4));
m_float_emit.STUR(write_size, coords, EncodeRegTo64(scratch1_reg), -16);
SetJumpTarget(dont_store);
}
native_format->components = count_out;
native_format->enable = true;
native_format->offset = m_dst_ofs;
native_format->type = VAR_FLOAT;
native_format->integer = false;
m_dst_ofs += sizeof(float) * count_out;
if (attribute == VertexComponentFormat::Direct)
m_src_ofs += load_bytes;
return load_bytes;
}
void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat format, s32 offset)
{
int load_bytes = 0;
switch (format)
{
case ColorFormat::RGB888:
case ColorFormat::RGB888x:
case ColorFormat::RGBA8888:
if (offset == -1)
LDR(IndexType::Unsigned, scratch2_reg, EncodeRegTo64(scratch1_reg), 0);
else if (offset & 3) // Not aligned - unscaled
LDUR(scratch2_reg, src_reg, offset);
else
LDR(IndexType::Unsigned, scratch2_reg, src_reg, offset);
if (format != ColorFormat::RGBA8888)
ORRI2R(scratch2_reg, scratch2_reg, 0xFF000000);
STR(IndexType::Unsigned, scratch2_reg, dst_reg, m_dst_ofs);
load_bytes = format == ColorFormat::RGB888 ? 3 : 4;
break;
case ColorFormat::RGB565:
// RRRRRGGG GGGBBBBB
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
if (offset == -1)
LDRH(IndexType::Unsigned, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
else if (offset & 1) // Not aligned - unscaled
LDURH(scratch3_reg, src_reg, offset);
else
LDRH(IndexType::Unsigned, scratch3_reg, src_reg, offset);
REV16(scratch3_reg, scratch3_reg);
// B
AND(scratch2_reg, scratch3_reg, 32, 4);
ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 3));
ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 5));
ORR(scratch1_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 16));
// G
UBFM(scratch2_reg, scratch3_reg, 5, 10);
ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 6));
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 8));
// R
UBFM(scratch2_reg, scratch3_reg, 11, 15);
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 3));
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 2));
// A
ORRI2R(scratch1_reg, scratch1_reg, 0xFF000000);
STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs);
load_bytes = 2;
break;
case ColorFormat::RGBA4444:
// BBBBAAAA RRRRGGGG
// REV16 - RRRRGGGG BBBBAAAA
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
if (offset == -1)
LDRH(IndexType::Unsigned, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
else if (offset & 1) // Not aligned - unscaled
LDURH(scratch3_reg, src_reg, offset);
else
LDRH(IndexType::Unsigned, scratch3_reg, src_reg, offset);
// R
UBFM(scratch1_reg, scratch3_reg, 4, 7);
// G
AND(scratch2_reg, scratch3_reg, 32, 3);
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 8));
// B
UBFM(scratch2_reg, scratch3_reg, 12, 15);
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 16));
// A
UBFM(scratch2_reg, scratch3_reg, 8, 11);
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 24));
// Final duplication
ORR(scratch1_reg, scratch1_reg, scratch1_reg, ArithOption(scratch1_reg, ShiftType::LSL, 4));
STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs);
load_bytes = 2;
break;
case ColorFormat::RGBA6666:
// RRRRRRGG GGGGBBBB BBAAAAAA
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
if (offset == -1)
{
LDUR(scratch3_reg, EncodeRegTo64(scratch1_reg), -1);
}
else
{
offset -= 1;
if (offset & 3) // Not aligned - unscaled
LDUR(scratch3_reg, src_reg, offset);
else
LDR(IndexType::Unsigned, scratch3_reg, src_reg, offset);
}
REV32(scratch3_reg, scratch3_reg);
// A
UBFM(scratch2_reg, scratch3_reg, 0, 5);
ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 6));
ORR(scratch1_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 24));
// B
UBFM(scratch2_reg, scratch3_reg, 6, 11);
ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 6));
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 16));
// G
UBFM(scratch2_reg, scratch3_reg, 12, 17);
ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 6));
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 8));
// R
UBFM(scratch2_reg, scratch3_reg, 18, 23);
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 4));
STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs);
load_bytes = 3;
break;
}
if (attribute == VertexComponentFormat::Direct)
m_src_ofs += load_bytes;
}
void VertexLoaderARM64::GenerateVertexLoader()
{
// R0 - Source pointer
// R1 - Destination pointer
// R2 - Count
// R30 - LR
//
// R0 return how many
//
// Registers we don't have to worry about saving
// R9-R17 are caller saved temporaries
// R18 is a temporary or platform specific register(iOS)
//
// VFP registers
// We can touch all except v8-v15
// If we need to use those, we need to retain the lower 64bits(!) of the register
bool has_tc = false;
bool has_tc_scale = false;
for (size_t i = 0; i < m_VtxDesc.high.TexCoord.Size(); i++)
{
has_tc |= m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent;
has_tc_scale |= (m_VtxAttr.GetTexFrac(i) != 0);
}
bool need_scale = (m_VtxAttr.g0.ByteDequant && m_VtxAttr.g0.PosFrac) ||
(has_tc && has_tc_scale) ||
(m_VtxDesc.low.Normal != VertexComponentFormat::NotPresent);
AlignCode16();
if (IsIndexed(m_VtxDesc.low.Position))
MOV(skipped_reg, ARM64Reg::WZR);
MOV(saved_count, count_reg);
MOVP2R(stride_reg, g_main_cp_state.array_strides);
MOVP2R(arraybase_reg, VertexLoaderManager::cached_arraybases);
if (need_scale)
MOVP2R(scale_reg, scale_factors);
const u8* loop_start = GetCodePtr();
if (m_VtxDesc.low.PosMatIdx)
{
LDRB(IndexType::Unsigned, scratch1_reg, src_reg, m_src_ofs);
AND(scratch1_reg, scratch1_reg, 0, 5);
STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs);
// Z-Freeze
CMP(count_reg, 3);
FixupBranch dont_store = B(CC_GT);
MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::position_matrix_index);
STR(IndexType::Unsigned, scratch1_reg, EncodeRegTo64(scratch2_reg), 0);
SetJumpTarget(dont_store);
m_native_vtx_decl.posmtx.components = 4;
m_native_vtx_decl.posmtx.enable = true;
m_native_vtx_decl.posmtx.offset = m_dst_ofs;
m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
m_native_vtx_decl.posmtx.integer = true;
m_src_ofs += sizeof(u8);
m_dst_ofs += sizeof(u32);
}
std::array<u32, 8> texmatidx_ofs;
for (size_t i = 0; i < m_VtxDesc.low.TexMatIdx.Size(); i++)
{
if (m_VtxDesc.low.TexMatIdx[i])
texmatidx_ofs[i] = m_src_ofs++;
}
// Position
{
int elem_size = GetElementSize(m_VtxAttr.g0.PosFormat);
int pos_elements = m_VtxAttr.g0.PosElements == CoordComponentCount::XY ? 2 : 3;
int load_bytes = elem_size * pos_elements;
int load_size = GetLoadSize(load_bytes);
load_size <<= 3;
s32 offset = GetAddressImm(ARRAY_POSITION, m_VtxDesc.low.Position, EncodeRegTo64(scratch1_reg),
load_size);
ReadVertex(m_VtxDesc.low.Position, m_VtxAttr.g0.PosFormat, pos_elements, pos_elements,
m_VtxAttr.g0.ByteDequant, m_VtxAttr.g0.PosFrac, &m_native_vtx_decl.position, offset);
}
if (m_VtxDesc.low.Normal != VertexComponentFormat::NotPresent)
{
static const u8 map[8] = {7, 6, 15, 14};
const u8 scaling_exponent = map[u32(m_VtxAttr.g0.NormalFormat.Value())];
const int limit = m_VtxAttr.g0.NormalElements == NormalComponentCount::NBT ? 3 : 1;
s32 offset = -1;
for (int i = 0; i < (m_VtxAttr.g0.NormalElements == NormalComponentCount::NBT ? 3 : 1); i++)
{
if (!i || m_VtxAttr.g0.NormalIndex3)
{
int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat);
int load_bytes = elem_size * 3;
int load_size = GetLoadSize(load_bytes);
offset = GetAddressImm(ARRAY_NORMAL, m_VtxDesc.low.Normal, EncodeRegTo64(scratch1_reg),
load_size << 3);
if (offset == -1)
ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), i * elem_size * 3);
else
offset += i * elem_size * 3;
}
int bytes_read = ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true,
scaling_exponent, &m_native_vtx_decl.normals[i], offset);
if (offset == -1)
ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), bytes_read);
else
offset += bytes_read;
}
}
for (size_t i = 0; i < m_VtxDesc.low.Color.Size(); i++)
{
m_native_vtx_decl.colors[i].components = 4;
m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE;
m_native_vtx_decl.colors[i].integer = false;
if (m_VtxDesc.low.Color[i] != VertexComponentFormat::NotPresent)
{
u32 align = 4;
if (m_VtxAttr.GetColorFormat(i) == ColorFormat::RGB565 ||
m_VtxAttr.GetColorFormat(i) == ColorFormat::RGBA4444)
align = 2;
s32 offset = GetAddressImm(ARRAY_COLOR0 + int(i), m_VtxDesc.low.Color[i],
EncodeRegTo64(scratch1_reg), align);
ReadColor(m_VtxDesc.low.Color[i], m_VtxAttr.GetColorFormat(i), offset);
m_native_vtx_decl.colors[i].components = 4;
m_native_vtx_decl.colors[i].enable = true;
m_native_vtx_decl.colors[i].offset = m_dst_ofs;
m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE;
m_native_vtx_decl.colors[i].integer = false;
m_dst_ofs += 4;
}
}
for (size_t i = 0; i < m_VtxDesc.high.TexCoord.Size(); i++)
{
m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
m_native_vtx_decl.texcoords[i].type = VAR_FLOAT;
m_native_vtx_decl.texcoords[i].integer = false;
int elements = m_VtxAttr.GetTexElements(i) == TexComponentCount::S ? 1 : 2;
if (m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent)
{
int elem_size = GetElementSize(m_VtxAttr.GetTexFormat(i));
int load_bytes = elem_size * (elements + 2);
int load_size = GetLoadSize(load_bytes);
load_size <<= 3;
s32 offset = GetAddressImm(ARRAY_TEXCOORD0 + int(i), m_VtxDesc.high.TexCoord[i],
EncodeRegTo64(scratch1_reg), load_size);
u8 scaling_exponent = m_VtxAttr.GetTexFrac(i);
ReadVertex(m_VtxDesc.high.TexCoord[i], m_VtxAttr.GetTexFormat(i), elements,
m_VtxDesc.low.TexMatIdx[i] ? 2 : elements, m_VtxAttr.g0.ByteDequant,
scaling_exponent, &m_native_vtx_decl.texcoords[i], offset);
}
if (m_VtxDesc.low.TexMatIdx[i])
{
m_native_vtx_decl.texcoords[i].components = 3;
m_native_vtx_decl.texcoords[i].enable = true;
m_native_vtx_decl.texcoords[i].type = VAR_FLOAT;
m_native_vtx_decl.texcoords[i].integer = false;
LDRB(IndexType::Unsigned, scratch2_reg, src_reg, texmatidx_ofs[i]);
m_float_emit.UCVTF(ARM64Reg::S31, scratch2_reg);
if (m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent)
{
m_float_emit.STR(32, IndexType::Unsigned, ARM64Reg::D31, dst_reg, m_dst_ofs);
m_dst_ofs += sizeof(float);
}
else
{
m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
if (m_dst_ofs < 256)
{
STUR(ARM64Reg::SP, dst_reg, m_dst_ofs);
}
else if (!(m_dst_ofs & 7))
{
// If m_dst_ofs isn't 8byte aligned we can't store an 8byte zero register
// So store two 4byte zero registers
// The destination is always 4byte aligned
STR(IndexType::Unsigned, ARM64Reg::WSP, dst_reg, m_dst_ofs);
STR(IndexType::Unsigned, ARM64Reg::WSP, dst_reg, m_dst_ofs + 4);
}
else
{
STR(IndexType::Unsigned, ARM64Reg::SP, dst_reg, m_dst_ofs);
}
m_float_emit.STR(32, IndexType::Unsigned, ARM64Reg::D31, dst_reg, m_dst_ofs + 8);
m_dst_ofs += sizeof(float) * 3;
}
}
}
// Prepare for the next vertex.
ADD(dst_reg, dst_reg, m_dst_ofs);
const u8* cont = GetCodePtr();
ADD(src_reg, src_reg, m_src_ofs);
SUB(count_reg, count_reg, 1);
CBNZ(count_reg, loop_start);
if (IsIndexed(m_VtxDesc.low.Position))
{
SUB(ARM64Reg::W0, saved_count, skipped_reg);
RET(ARM64Reg::X30);
SetJumpTarget(m_skip_vertex);
ADD(skipped_reg, skipped_reg, 1);
B(cont);
}
else
{
MOV(ARM64Reg::W0, saved_count);
RET(ARM64Reg::X30);
}
FlushIcache();
ASSERT(m_vertex_size == m_src_ofs);
m_native_vtx_decl.stride = m_dst_ofs;
}
int VertexLoaderARM64::RunVertices(DataReader src, DataReader dst, int count)
{
m_numLoadedVertices += count;
return ((int (*)(u8 * src, u8 * dst, int count)) region)(src.GetPointer(), dst.GetPointer(),
count);
}