VertexLoaderARM64: Always use unscaled load/store instructions

The source and destination offsets will always be less than 255, so we can get rid of a lot of the complexity by doing this.
This commit is contained in:
Pokechu22 2022-07-14 20:02:55 -07:00
parent a34d5e5960
commit 9a290c3d50

View File

@ -66,32 +66,12 @@ void VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attri
{
if (attribute == VertexComponentFormat::Index8)
{
if (m_src_ofs < 4096)
{
LDRB(IndexType::Unsigned, scratch1_reg, src_reg, m_src_ofs);
}
else
{
ADD(reg, src_reg, m_src_ofs);
LDRB(IndexType::Unsigned, scratch1_reg, reg, 0);
}
LDURB(scratch1_reg, src_reg, m_src_ofs);
m_src_ofs += 1;
}
else
{
if (m_src_ofs < 256)
else // Index16
{
LDURH(scratch1_reg, src_reg, m_src_ofs);
}
else if (m_src_ofs <= 8190 && !(m_src_ofs & 1))
{
LDRH(IndexType::Unsigned, scratch1_reg, src_reg, m_src_ofs);
}
else
{
ADD(reg, src_reg, m_src_ofs);
LDRH(IndexType::Unsigned, scratch1_reg, reg, 0);
}
m_src_ofs += 2;
REV16(scratch1_reg, scratch1_reg);
}
@ -118,7 +98,7 @@ void VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attri
s32 VertexLoaderARM64::GetAddressImm(CPArray array, VertexComponentFormat attribute,
Arm64Gen::ARM64Reg reg, u32 align)
{
if (IsIndexed(attribute) || (m_src_ofs > 255 && (m_src_ofs & (align - 1))))
if (IsIndexed(attribute))
GetVertexAddr(array, attribute, reg);
else
return m_src_ofs;
@ -145,13 +125,9 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm
else
m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg));
}
else if (offset & (load_size - 1)) // Not aligned - unscaled
{
m_float_emit.LDUR(load_size, coords, src_reg, offset);
}
else
{
m_float_emit.LDR(load_size, IndexType::Unsigned, coords, src_reg, offset);
m_float_emit.LDUR(load_size, coords, src_reg, offset);
}
if (format != ComponentFormat::Float)
@ -191,20 +167,7 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm
}
const u32 write_size = count_out == 3 ? 128 : count_out * 32;
const u32 mask = count_out == 3 ? 0xF : count_out == 2 ? 0x7 : 0x3;
if (m_dst_ofs < 256)
{
m_float_emit.STUR(write_size, coords, dst_reg, m_dst_ofs);
}
else if (!(m_dst_ofs & mask))
{
m_float_emit.STR(write_size, IndexType::Unsigned, coords, dst_reg, m_dst_ofs);
}
else
{
ADD(EncodeRegTo64(scratch2_reg), dst_reg, m_dst_ofs);
m_float_emit.ST1(32, 1, coords, EncodeRegTo64(scratch2_reg));
}
// Z-Freeze
if (native_format == &m_native_vtx_decl.position)
@ -253,10 +216,8 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f
case ColorFormat::RGBA8888:
if (offset == -1)
LDR(IndexType::Unsigned, scratch2_reg, EncodeRegTo64(scratch1_reg), 0);
else if (offset & 3) // Not aligned - unscaled
LDUR(scratch2_reg, src_reg, offset);
else
LDR(IndexType::Unsigned, scratch2_reg, src_reg, offset);
LDUR(scratch2_reg, src_reg, offset);
if (format != ColorFormat::RGBA8888)
ORR(scratch2_reg, scratch2_reg, LogicalImm(0xFF000000, 32));
@ -269,10 +230,8 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
if (offset == -1)
LDRH(IndexType::Unsigned, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
else if (offset & 1) // Not aligned - unscaled
LDURH(scratch3_reg, src_reg, offset);
else
LDRH(IndexType::Unsigned, scratch3_reg, src_reg, offset);
LDURH(scratch3_reg, src_reg, offset);
REV16(scratch3_reg, scratch3_reg);
@ -306,10 +265,8 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
if (offset == -1)
LDRH(IndexType::Unsigned, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
else if (offset & 1) // Not aligned - unscaled
LDURH(scratch3_reg, src_reg, offset);
else
LDRH(IndexType::Unsigned, scratch3_reg, src_reg, offset);
LDURH(scratch3_reg, src_reg, offset);
// R
UBFM(scratch1_reg, scratch3_reg, 4, 7);
@ -337,17 +294,9 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f
// RRRRRRGG GGGGBBBB BBAAAAAA
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
if (offset == -1)
{
LDUR(scratch3_reg, EncodeRegTo64(scratch1_reg), -1);
}
else
{
offset -= 1;
if (offset & 3) // Not aligned - unscaled
LDUR(scratch3_reg, src_reg, offset);
else
LDR(IndexType::Unsigned, scratch3_reg, src_reg, offset);
}
LDUR(scratch3_reg, src_reg, offset - 1);
REV32(scratch3_reg, scratch3_reg);
@ -385,6 +334,12 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f
void VertexLoaderARM64::GenerateVertexLoader()
{
// The largest input vertex (with the position matrix index and all texture matrix indices
// enabled, and all components set as direct) is 129 bytes (corresponding to a 156-byte
// output). This is small enough that we can always use the unscaled load/store instructions
// (which allow an offset from -256 to +255).
ASSERT(m_vertex_size <= 255);
// R0 - Source pointer
// R1 - Destination pointer
// R2 - Count
@ -568,22 +523,7 @@ void VertexLoaderARM64::GenerateVertexLoader()
{
m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
if (m_dst_ofs < 256)
{
STUR(ARM64Reg::SP, dst_reg, m_dst_ofs);
}
else if (!(m_dst_ofs & 7))
{
// If m_dst_ofs isn't 8byte aligned we can't store an 8byte zero register
// So store two 4byte zero registers
// The destination is always 4byte aligned
STR(IndexType::Unsigned, ARM64Reg::WSP, dst_reg, m_dst_ofs);
STR(IndexType::Unsigned, ARM64Reg::WSP, dst_reg, m_dst_ofs + 4);
}
else
{
STR(IndexType::Unsigned, ARM64Reg::SP, dst_reg, m_dst_ofs);
}
m_float_emit.STR(32, IndexType::Unsigned, ARM64Reg::D31, dst_reg, m_dst_ofs + 8);
m_dst_ofs += sizeof(float) * 3;