From 69597166f332579d88b76c4b452387051ac1de18 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 26 Jul 2024 11:42:18 +0200 Subject: [PATCH] start the shader decompiler --- src/Cafe/CMakeLists.txt | 3 + .../LatteDecompiler.cpp | 22 +- .../LatteDecompilerEmitMSL.cpp | 4127 +++++++++++++++++ .../LatteDecompilerEmitMSLAttrDecoder.cpp | 508 ++ .../LatteDecompilerEmitMSLHeader.hpp | 426 ++ .../LatteDecompilerInternal.h | 7 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 + 7 files changed, 5088 insertions(+), 8 deletions(-) create mode 100644 src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp create mode 100644 src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp create mode 100644 src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index d4446652..fa3c6ff9 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -151,6 +151,9 @@ add_library(CemuCafe HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLAttrDecoder.cpp HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSL.cpp HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLHeader.hpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp index c3f7c19e..5f0d7fb2 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp @@ -12,6 +12,8 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" 
#include "util/helpers/helpers.h" +// TODO: remove this include +#include "util/helpers/StringBuf.h" // parse instruction and if valid append it to instructionList bool LatteDecompiler_ParseCFInstruction(LatteDecompilerShaderContext* shaderContext, uint32 cfIndex, uint32 cfWord0, uint32 cfWord1, bool* endOfProgram, std::vector& instructionList) @@ -323,8 +325,8 @@ bool LatteDecompiler_IsALUTransInstruction(bool isOP3, uint32 opcode) } else if( opcode == ALU_OP2_INST_MOV || opcode == ALU_OP2_INST_ADD || - opcode == ALU_OP2_INST_NOP || - opcode == ALU_OP2_INST_MUL || + opcode == ALU_OP2_INST_NOP || + opcode == ALU_OP2_INST_MUL || opcode == ALU_OP2_INST_DOT4 || opcode == ALU_OP2_INST_DOT4_IEEE || opcode == ALU_OP2_INST_MAX || // Not sure if MIN/MAX are non-transcendental? @@ -927,7 +929,7 @@ void LatteDecompiler_ParseTEXClause(LatteDecompilerShader* shaderContext, LatteD texInstruction.memRead.format = dataFormat; texInstruction.memRead.nfa = nfa; texInstruction.memRead.isSigned = isSigned; - + cfInstruction->instructionsTEX.emplace_back(texInstruction); } else @@ -1066,9 +1068,19 @@ void _LatteDecompiler_Process(LatteDecompilerShaderContext* shaderContext, uint8 LatteDecompiler_analyzeDataTypes(shaderContext); // emit code if (shaderContext->shader->hasError == false) - LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); + { + if (g_renderer->GetType() == RendererAPI::Metal) + { + LatteDecompiler_emitMSLShader(shaderContext, shaderContext->shader); + // HACK + std::cout << shaderContext->shaderSource->c_str() << std::endl; + } else + { + LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); + } + } LatteDecompiler_cleanup(shaderContext); - // fast access + // fast access _LatteDecompiler_GenerateDataForFastAccess(shaderContext->shader); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp new file mode 100644 index 00000000..943f1840 
--- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -0,0 +1,4127 @@ +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency +#include "Cafe/HW/Latte/Core/Latte.h" +#include "Cafe/HW/Latte/Core/LatteDraw.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "config/ActiveSettings.h" +#include "util/helpers/StringBuf.h" + +#include +#include + +#define _CRLF "\r\n" + +void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib); + +/* + * Variable names: + * R0-R127 temp + * Most variables are multi-typed and the respective type is appended to the name + * Type suffixes are: f (float), i (32bit int), ui (unsigned 32bit int) + * Examples: R13ui.x, tempf.z + */ + +// local prototypes +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType); +void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType); +void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine); + +static const char* _getElementStrByIndex(uint32 channel) +{ + switch (channel) + { + case 0: + return "x"; + case 1: + return "y"; + case 2: + return "z"; + case 3: + return "w"; + } + return "UNDEFINED"; +} + +static char _tempGenString[64][256]; +static uint32 _tempGenStringIndex = 0; + +static char* _getTempString() +{ + 
char* str = _tempGenString[_tempGenStringIndex]; + _tempGenStringIndex = (_tempGenStringIndex+1)%64; + return str; +} + +static char* _getActiveMaskVarName(LatteDecompilerShaderContext* shaderContext, sint32 index) +{ + char* varName = _getTempString(); + if (shaderContext->isSubroutine) + sprintf(varName, "activeMaskStackSub%04x[%d]", shaderContext->subroutineInfo->cfAddr, index); + else + sprintf(varName, "activeMaskStack[%d]", index); + return varName; +} + +static char* _getActiveMaskCVarName(LatteDecompilerShaderContext* shaderContext, sint32 index) +{ + char* varName = _getTempString(); + if (shaderContext->isSubroutine) + sprintf(varName, "activeMaskStackCSub%04x[%d]", shaderContext->subroutineInfo->cfAddr, index); + else + sprintf(varName, "activeMaskStackC[%d]", index); + return varName; +} + +static char* _getRegisterVarName(LatteDecompilerShaderContext* shaderContext, uint32 index, sint32 destRelIndexMode=-1) +{ + auto type = shaderContext->typeTracker.defaultDataType; + char* tempStr = _getTempString(); + if (shaderContext->typeTracker.useArrayGPRs == false) + { + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + sprintf(tempStr, "R%di", index); + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + sprintf(tempStr, "R%df", index); + } + else + { + char destRelOffset[32]; + if (destRelIndexMode >= 0) + { + if (destRelIndexMode == GPU7_INDEX_AR_X) + strcpy(destRelOffset, "ARi.x"); + else if (destRelIndexMode == GPU7_INDEX_AR_Y) + strcpy(destRelOffset, "ARi.y"); + else if (destRelIndexMode == GPU7_INDEX_AR_Z) + strcpy(destRelOffset, "ARi.z"); + else if (destRelIndexMode == GPU7_INDEX_AR_W) + strcpy(destRelOffset, "ARi.w"); + else + debugBreakpoint(); + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + sprintf(tempStr, "Ri[%d+%s]", index, destRelOffset); + } + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + { + sprintf(tempStr, "Rf[%d+%s]", index, destRelOffset); + } + } + else + { + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + sprintf(tempStr, 
"Ri[%d]", index); + } + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + { + sprintf(tempStr, "Rf[%d]", index); + } + } + } + return tempStr; +} + +static void _appendRegisterTypeSuffix(StringBuf* src, sint32 dataType) +{ + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("i"); + else if (dataType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("ui"); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add("f"); + else + cemu_assert_unimplemented(); +} + +// appends x/y/z/w +static void _appendChannel(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add("x"); + return; + case 1: + src->add("y"); + return; + case 2: + src->add("z"); + return; + case 3: + src->add("w"); + return; + } +} + +// appends .x/.y/.z/.w +static void _appendChannelAccess(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add(".x"); + return; + case 1: + src->add(".y"); + return; + case 2: + src->add(".z"); + return; + case 3: + src->add(".w"); + return; + } +} + +static void _appendPVPS(LatteDecompilerShaderContext* shaderContext, StringBuf* src, uint32 groupIndex, uint8 aluUnit) +{ + cemu_assert_debug(aluUnit < 5); + if (aluUnit == 4) + { + src->addFmt("PS{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + return; + } + src->addFmt("PV{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + _appendChannel(src, aluUnit); +} + +std::string _FormatFloatAsConstant(float f) +{ + char floatAsStr[64]; + size_t floatAsStrLen = fmt::format_to_n(floatAsStr, 64, "{:#}", f).size; + size_t floatAsStrLenOrg = floatAsStrLen; + if(floatAsStrLen > 0 && floatAsStr[floatAsStrLen-1] == '.') + { + floatAsStr[floatAsStrLen] = '0'; + floatAsStrLen++; + } + cemu_assert(floatAsStrLen < 50); // constant 
suspiciously long? + floatAsStr[floatAsStrLen] = '\0'; + cemu_assert_debug(floatAsStrLen >= 3); // shortest possible form is "0.0" + return floatAsStr; +} + +// tracks PV/PS and register backups +struct ALUClauseTemporariesState +{ + struct PVPSAlias + { + enum class LOCATION_TYPE : uint8 + { + LOCATION_NONE, + LOCATION_GPR, + LOCATION_PVPS, + }; + + LOCATION_TYPE location{ LOCATION_TYPE::LOCATION_NONE }; + uint8 index; // GPR index or temporary index + uint8 aluUnit; // x,y,z,w (or 5 for PS) + + void SetLocationGPR(uint8 gprIndex, uint8 channel) + { + cemu_assert_debug(channel < 4); + this->location = LOCATION_TYPE::LOCATION_GPR; + this->index = gprIndex; + this->aluUnit = channel; + } + + void SetLocationPSPVTemporary(uint8 aluUnit, uint32 groupIndex) + { + cemu_assert_debug(aluUnit < 5); + this->location = LOCATION_TYPE::LOCATION_PVPS; + this->index = groupIndex & 1; + this->aluUnit = aluUnit; + } + }; + + struct GPRTemporary + { + GPRTemporary(uint8 gprIndex, uint8 channel, uint8 backupVarIndex) : gprIndex(gprIndex), channel(channel), backupVarIndex(backupVarIndex) {} + + uint8 gprIndex; + uint8 channel; + uint8 backupVarIndex; + }; + + void TrackGroupOutputPVPS(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstr, size_t numInstr) + { + // unset current + for (auto& it : m_pvps) + it.location = PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + for (size_t i = 0; i < numInstr; i++) + { + LatteDecompilerALUInstruction& inst = aluInstr[i]; + if (!inst.isOP3 && inst.opcode == ALU_OP2_INST_NOP) + continue; // skip NOP instruction + + if (inst.writeMask == 0) + { + // map to temporary + m_pvps[inst.aluUnit].SetLocationPSPVTemporary(inst.aluUnit, aluInstr->instructionGroupIndex); + } + else + { + // map to GPR + if(inst.destRel == 0) // is PV/PS set for indexed writes? 
+ m_pvps[inst.aluUnit].SetLocationGPR(inst.destGpr, inst.destElem); + } + } + } + + bool HasPVPS(uint8 aluUnitIndex) const + { + cemu_assert_debug(aluUnitIndex < 5); + return m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + } + + void EmitPVPSAccess(LatteDecompilerShaderContext* shaderContext, uint8 aluUnitIndex, uint32 currentGroupIndex) const + { + switch (m_pvps[aluUnitIndex].location) + { + case PVPSAlias::LOCATION_TYPE::LOCATION_GPR: + { + sint32 temporaryIndex = GetTemporaryForGPR(m_pvps[aluUnitIndex].index, m_pvps[aluUnitIndex].aluUnit); + if (temporaryIndex < 0) + { + shaderContext->shaderSource->add(_getRegisterVarName(shaderContext, m_pvps[aluUnitIndex].index, -1)); + _appendChannelAccess(shaderContext->shaderSource, m_pvps[aluUnitIndex].aluUnit); + } + else + { + // use temporary instead of GPR + shaderContext->shaderSource->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(shaderContext->shaderSource, shaderContext->typeTracker.defaultDataType); + } + break; + } + case PVPSAlias::LOCATION_TYPE::LOCATION_PVPS: + _appendPVPS(shaderContext, shaderContext->shaderSource, currentGroupIndex-1, m_pvps[aluUnitIndex].aluUnit); + break; + default: + cemuLog_log(LogType::Force, "Shader {:016x} accesses PV/PS without writing to it", shaderContext->shaderBaseHash); + cemu_assert_suspicious(); + break; + } + } + + /* + * Check for GPR channels which are modified before they are read within the same group + * These registers need to be copied to a temporary + */ + void CreateGPRTemporaries(LatteDecompilerShaderContext* shaderContext, std::span aluInstructions) + { + uint8 registerChannelWriteMask[(LATTE_NUM_GPR * 4 + 7) / 8] = { 0 }; + + m_gprTemporaries.clear(); + for (auto& aluInstruction : aluInstructions) + { + // ignore NOP instructions + if (aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_NOP) + continue; + cemu_assert_debug(aluInstruction.destElem <= 3); + // check if any previously written 
register is read + for (sint32 f = 0; f < 3; f++) + { + uint32 readGPRIndex; + uint32 readGPRChannel; + if (GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel)) + { + readGPRIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction.sourceOperand[f].sel); + cemu_assert_debug(aluInstruction.sourceOperand[f].chan <= 3); + readGPRChannel = aluInstruction.sourceOperand[f].chan; + } + else if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel) || GPU7_ALU_SRC_IS_PS(aluInstruction.sourceOperand[f].sel)) + { + uint8 aluUnitIndex = 0; + if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel)) + aluUnitIndex = aluInstruction.sourceOperand[f].chan; + else + aluUnitIndex = 4; + // if aliased to a GPR, then consider it a GPR read + if(m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_GPR) + continue; + readGPRIndex = m_pvps[aluUnitIndex].index; + readGPRChannel = m_pvps[aluUnitIndex].aluUnit; + } + else + continue; + // track GPR read + if ((registerChannelWriteMask[(readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) / 8] & (1 << ((readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) % 8))) != 0) + { + // register is overwritten by previous instruction, a temporary variable is required + if (GetTemporaryForGPR(readGPRIndex, readGPRChannel) < 0) + m_gprTemporaries.emplace_back(readGPRIndex, readGPRChannel, m_gprTemporaries.size()); + } + } + // track write + if (aluInstruction.writeMask != 0) + registerChannelWriteMask[(aluInstruction.destGpr * 4 + aluInstruction.destElem) / 8] |= (1 << ((aluInstruction.destGpr * 4 + aluInstruction.destElem) % 8)); + } + // output code to move GPRs into temporaries + StringBuf* src = shaderContext->shaderSource; + for (auto& it : m_gprTemporaries) + { + src->addFmt("backupReg{}", it.backupVarIndex); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + src->add(" = "); + src->add(_getRegisterVarName(shaderContext, it.gprIndex)); + _appendChannelAccess(src, it.channel); + src->add(";" 
_CRLF); + } + } + + // returns -1 if none present + sint32 GetTemporaryForGPR(uint8 gprIndex, uint8 channel) const + { + for (auto& it : m_gprTemporaries) + { + if (it.gprIndex == gprIndex && it.channel == channel) + return (sint32)it.backupVarIndex; + } + return -1; + } + +private: + PVPSAlias m_pvps[5]{}; + boost::container::small_vector m_gprTemporaries; +}; + +sint32 _getVertexShaderOutParamSemanticId(uint32* contextRegisters, sint32 index); +sint32 _getInputRegisterDataType(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex); +sint32 _getALUInstructionOutputDataType(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction); +bool _isReductionInstruction(LatteDecompilerALUInstruction* aluInstruction); + +/* + * Writes the name of the output variable and channel + * E.g. R5f.x or tempf.x if writeMask is 0 + */ +static void _emitInstructionOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) +{ + auto src = shaderContext->shaderSource; + sint32 outputDataType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + if( aluInstruction->writeMask == 0 ) + { + // does not output to GPR + if( !_isReductionInstruction(aluInstruction) ) + { + // output to PV/PS + _appendPVPS(shaderContext, src, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); + return; + } + else + { + // output to temp + src->add("temp"); + _appendRegisterTypeSuffix(src, outputDataType); + } + _appendChannelAccess(src, aluInstruction->aluUnit); + } + else + { + // output to GPR. 
Aliasing to PV/PS happens at the end of the group + src->add(_getRegisterVarName(shaderContext, aluInstruction->destGpr, aluInstruction->destRel==0?-1:aluInstruction->indexMode)); + _appendChannelAccess(src, aluInstruction->destElem); + } +} + +static void _emitInstructionPVPSOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) +{ + _appendPVPS(shaderContext, shaderContext->shaderSource, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); +} + +static void _emitRegisterAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel0, sint32 channel1, sint32 channel2, sint32 channel3, sint32 dataType = -1) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; + cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + if (dataType >= 0) + { + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType); + } + if (shaderContext->typeTracker.useArrayGPRs) + src->add("R"); + else + src->addFmt("R{}", gprIndex); + _appendRegisterTypeSuffix(src, registerElementDataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->addFmt("[{}]", gprIndex); + + src->add("."); + + sint32 channelArray[4]; + channelArray[0] = channel0; + channelArray[1] = channel1; + channelArray[2] = channel2; + channelArray[3] = channel3; + + for (sint32 i = 0; i < 4; i++) + { + if (channelArray[i] >= 0 && channelArray[i] <= 3) + src->add(_getElementStrByIndex(channelArray[i])); + else if (channelArray[i] == -1) + { + // channel not used + } + else + { + cemu_assert_unimplemented(); + } + } + if (dataType >= 0) + _emitTypeConversionSuffixMSL(shaderContext, registerElementDataType, dataType); +} + +// optimized variant of _emitRegisterAccessCode for raw one channel reads +static void _emitRegisterChannelAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel, sint32 dataType) +{ + 
cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + cemu_assert_debug(channel >= 0 && channel < 4); + StringBuf* src = shaderContext->shaderSource; + sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->add("R"); + else + src->addFmt("R{}", gprIndex); + _appendRegisterTypeSuffix(src, registerElementDataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->addFmt("[{}]", gprIndex); + src->add("."); + src->add(_getElementStrByIndex(channel)); + _emitTypeConversionSuffixMSL(shaderContext, registerElementDataType, dataType); +} + +static void _emitALURegisterInputAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + cemu_assert_debug(GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel)); + sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + sint32 temporaryIndex = shaderContext->aluPVPSState->GetTemporaryForGPR(gprIndex, aluInstruction->sourceOperand[operandIndex].chan); + if(temporaryIndex >= 0) + { + // access via backup variable + src->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(src, currentRegisterElementType); + } + else + { + // access via register variable + _emitRegisterAccessCode(shaderContext, gprIndex, aluInstruction->sourceOperand[operandIndex].chan, -1, -1, -1); + } +} + +static void _emitPVPSAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, uint8 aluUnitIndex) +{ + cemu_assert_debug(aluInstruction->instructionGroupIndex > 0); // PV/PS is uninitialized for group 0 + // PV/PS vars are currently always using 
the default type (shaderContext->typeTracker.defaultDataType) + shaderContext->aluPVPSState->EmitPVPSAccess(shaderContext, aluUnitIndex, aluInstruction->instructionGroupIndex); +} + +/* + * Emits the expression used for calculating the index for uniform access + * For static access, this is a number + * For dynamic access, this is AR.* + base + */ +static void _emitUniformAccessIndexCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +{ + StringBuf* src = shaderContext->shaderSource; + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + sint32 uniformOffset = 0; // index into array, for relative accesses this is the base offset + if( isUniformRegister ) + { + uniformOffset = GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + } + else + { + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank0AddrBase; + } + else + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank1AddrBase; + } + } + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + { + if (aluInstruction->indexMode == GPU7_INDEX_AR_X) + src->addFmt("ARi.x+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Y) + src->addFmt("ARi.y+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Z) + src->addFmt("ARi.z+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_W) + src->addFmt("ARi.w+{}", uniformOffset); + else + cemu_assert_unimplemented(); + } + else + { + src->addFmt("{}", uniformOffset); + } +} + +static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 
requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + if(shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED ) + { + // uniform registers or buffers are accessed statically with predictable offsets + // find entry in remapped uniform + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + debugBreakpoint(); + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + sint32 uniformOffset = 0; // index into array + sint32 uniformBufferIndex = 0; + if( isUniformRegister ) + { + uniformOffset = GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + uniformBufferIndex = 0; + } + else + { + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank0AddrBase; + uniformBufferIndex = aluInstruction->cfInstruction->cBank0Index; + } + else + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank1AddrBase; + uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; + } + } + LatteDecompilerRemappedUniformEntry_t* remappedUniformEntry = NULL; + for(size_t i=0; i< shaderContext->shader->list_remappedUniformEntries.size(); i++) + { + LatteDecompilerRemappedUniformEntry_t* remappedUniformEntryItr = shaderContext->shader->list_remappedUniformEntries.data() + i; + if( remappedUniformEntryItr->isRegister && isUniformRegister ) + { + if( remappedUniformEntryItr->index == uniformOffset ) + { + remappedUniformEntry = remappedUniformEntryItr; + break; + } + } + else + { + if( remappedUniformEntryItr->kcacheBankId == uniformBufferIndex && remappedUniformEntryItr->index == uniformOffset ) + { + remappedUniformEntry = remappedUniformEntryItr; + break; + } + } + } + cemu_assert_debug(remappedUniformEntry); + 
_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + if(shaderContext->shader->shaderType == LatteConst::ShaderType::Vertex ) + src->addFmt("uf_remappedVS[{}]", remappedUniformEntry->mappedIndex); + else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Pixel ) + src->addFmt("uf_remappedPS[{}]", remappedUniformEntry->mappedIndex); + else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Geometry ) + src->addFmt("uf_remappedGS[{}]", remappedUniformEntry->mappedIndex); + else + debugBreakpoint(); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + } + else if( shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE ) + { + // uniform registers are accessed with unpredictable (dynamic) offset + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + if(shaderContext->shader->shaderType == LatteConst::ShaderType::Vertex ) + src->add("uf_uniformRegisterVS["); + else if (shaderContext->shader->shaderType == LatteConst::ShaderType::Pixel) + src->add("uf_uniformRegisterPS["); + else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Geometry ) + src->add("uf_uniformRegisterGS["); + else + debugBreakpoint(); + _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); + src->add("]"); + + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + } + else if( shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK ) + { + // uniform buffers are available as a whole + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + if( isUniformRegister ) + debugBreakpoint(); + sint32 uniformBufferIndex = 0; + if( 
GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformBufferIndex = aluInstruction->cfInstruction->cBank0Index; + } + else + { + uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; + } + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->addFmt("ubuff{}[", uniformBufferIndex); + _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); + src->addFmt("]"); + + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else + debugBreakpoint(); +} + +// Generates (slow) code to read an indexed GPR +static void _emitCodeToReadRelativeGPR(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 gprBaseIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + cemu_assert_debug(aluInstruction->sourceOperand[operandIndex].rel != 0); + + if( shaderContext->typeTracker.useArrayGPRs ) + { + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, requiredType); + src->add(_getRegisterVarName(shaderContext, gprBaseIndex, aluInstruction->indexMode)); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, requiredType); + return; + } + + char indexAccessCode[64]; + if (aluInstruction->indexMode == GPU7_INDEX_AR_X) + sprintf(indexAccessCode, "ARi.x"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Y) + sprintf(indexAccessCode, "ARi.y"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Z) + sprintf(indexAccessCode, "ARi.z"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_W) + sprintf(indexAccessCode, "ARi.w"); + else + 
cemu_assert_unimplemented(); + + if( LATTE_DECOMPILER_DTYPE_SIGNED_INT != requiredType ) + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + + // generated code looks like this: + // result = ((lookupIndex==0)?GPR5:(lookupIndex==1)?GPR6:(lookupIndex==2)?GPR7:...:(lookupIndex==122)?GPR127:0) + src->add("("); + for(sint32 i=gprBaseIndex; ianalyzer.gprUseMask[i / 8] & (1 << (i % 8))) == 0 ) + continue; + src->addFmt("({}=={})?", indexAccessCode, i-gprBaseIndex); + // code to access gpr + uint32 gprIndex = i; + src->add(_getRegisterVarName(shaderContext, i)); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + src->add(":"); + } + src->add("0)"); + if( LATTE_DECOMPILER_DTYPE_SIGNED_INT != requiredType ) + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); +} + +static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + if( operandIndex < 0 || operandIndex >= 3 ) + debugBreakpoint(); + sint32 requiredTypeOut = requiredType; + if( requiredType != LATTE_DECOMPILER_DTYPE_FLOAT && (aluInstruction->sourceOperand[operandIndex].abs != 0 || aluInstruction->sourceOperand[operandIndex].neg != 0) ) + { + // we need to apply float operations on the input but it's not read as a float + // force internal required type to float and then cast it back to whatever type is actually required + requiredType = LATTE_DECOMPILER_DTYPE_FLOAT; + } + + if( requiredTypeOut != requiredType ) + _emitTypeConversionPrefixMSL(shaderContext, requiredType, requiredTypeOut); + + if( aluInstruction->sourceOperand[operandIndex].neg != 0 ) + src->add("-("); + if( aluInstruction->sourceOperand[operandIndex].abs != 0 ) + src->add("abs("); + + if( GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( 
aluInstruction->sourceOperand[operandIndex].rel != 0 ) + { + _emitCodeToReadRelativeGPR(shaderContext, aluInstruction, operandIndex, requiredType); + } + else + { + uint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // signed int 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + // write code for register input + _emitTypeConversionPrefixMSL(shaderContext, currentRegisterElementType, requiredType); + _emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionSuffixMSL(shaderContext, currentRegisterElementType, requiredType); + } + else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + { + // unsigned int 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // need to convert from int to uint + src->add("uint("); + } + else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + { + // no extra work necessary + } + else + debugBreakpoint(); + // write code for register input + _emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + src->add(")"); + } + } + else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + // float 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // need to convert (not cast) from int bits to float + src->add("intBitsToFloat("); + } + else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + // no extra work necessary + } + else + debugBreakpoint(); + // write code for register input + 
_emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + src->add(")"); + } + } + else + debugBreakpoint(); + } + } + else if( GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if(requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT || requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("0"); + else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add("0.0"); + } + else if( GPU7_ALU_SRC_IS_CONST_1F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("1.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else if( GPU7_ALU_SRC_IS_CONST_0_5F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("0.5"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else if( GPU7_ALU_SRC_IS_CONST_1I(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("int(1)"); + else if (requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("uint(1)"); + else + cemu_assert_suspicious(); + } + else if( GPU7_ALU_SRC_IS_CONST_M1I(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add("int(-1)"); + else + cemu_assert_suspicious(); + } + else if( GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->addFmt("0x{:x}", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + src->addFmt("uint(0x{:x})", 
aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + else if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + uint32 constVal = aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]; + sint32 exponent = (constVal >> 23) & 0xFF; + exponent -= 127; + if ((constVal & 0xFF) == 0 && exponent >= -10 && exponent <= 10) + { + src->add(_FormatFloatAsConstant(*(float*)&constVal)); + } + else + src->addFmt("intBitsToFloat(0x{:08x})", constVal); + } + } + else if( GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitUniformAccessCode(shaderContext, aluInstruction, operandIndex, requiredType); + } + else if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) || + GPU7_ALU_SRC_IS_CBANK1(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitUniformAccessCode(shaderContext, aluInstruction, operandIndex, requiredType); + } + else if( GPU7_ALU_SRC_IS_PV(aluInstruction->sourceOperand[operandIndex].sel) ) + { + sint32 currentPVDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionPrefixMSL(shaderContext, currentPVDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, currentPVDataType, requiredType); + } + else if( GPU7_ALU_SRC_IS_PS(aluInstruction->sourceOperand[operandIndex].sel) ) + { + sint32 currentPSDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionPrefixMSL(shaderContext, currentPSDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, 4); + _emitTypeConversionSuffixMSL(shaderContext, currentPSDataType, requiredType); + } + else + { + cemuLog_log(LogType::Force, "Unsupported shader ALU operand sel {:#x}\n", aluInstruction->sourceOperand[operandIndex].sel); + debugBreakpoint(); + } + + if( 
aluInstruction->sourceOperand[operandIndex].abs != 0 ) + src->add(")"); + if( aluInstruction->sourceOperand[operandIndex].neg != 0 ) + src->add(")"); + + if( requiredTypeOut != requiredType ) + _emitTypeConversionSuffixMSL(shaderContext, requiredType, requiredTypeOut); +} + +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType) +{ + if( sourceType == destinationType ) + return; + StringBuf* src = shaderContext->shaderSource; + if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("as_type("); + else if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("as_type("); + else if( sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add("as_type("); + else if( sourceType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add("int("); + else if( sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + src->add("uint("); + else + cemu_assert_debug(false); +} + +void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType) +{ + if( sourceType == destinationType ) + return; + StringBuf* src = shaderContext->shaderSource; + src->add(")"); +} + +template +static void _emitALUOperationBinary(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, const char* operandStr) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, TDataType, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, TDataType); + 
src->add((char*)operandStr); + _emitOperandInputCode(shaderContext, aluInstruction, 1, TDataType); + _emitTypeConversionSuffixMSL(shaderContext, TDataType, outputType); + src->add(";" _CRLF); +} + +static bool _isSameGPROperand(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndexA, sint32 opIndexB) +{ + if (aluInstruction->sourceOperand[opIndexA].sel != aluInstruction->sourceOperand[opIndexB].sel) + return false; + if (!GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[opIndexA].sel)) + return false; + if (aluInstruction->sourceOperand[opIndexA].chan != aluInstruction->sourceOperand[opIndexB].chan) + return false; + if (aluInstruction->sourceOperand[opIndexA].abs != aluInstruction->sourceOperand[opIndexB].abs) + return false; + if (aluInstruction->sourceOperand[opIndexA].neg != aluInstruction->sourceOperand[opIndexB].neg) + return false; + if (aluInstruction->sourceOperand[opIndexA].rel != aluInstruction->sourceOperand[opIndexB].rel) + return false; + return true; +} + +static bool _operandHasModifiers(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndex) +{ + return aluInstruction->sourceOperand[opIndex].abs != 0 || aluInstruction->sourceOperand[opIndex].neg != 0; +} + +static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); // data type of output + if( aluInstruction->opcode == ALU_OP2_INST_MOV ) + { + bool requiresFloatMove = false; + requiresFloatMove = aluInstruction->sourceOperand[0].abs != 0 || aluInstruction->sourceOperand[0].neg != 0; + if( requiresFloatMove ) + { + // abs/neg operations are applied to source operand, do float based move + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, 
LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, outputType); + src->add(";" _CRLF); + } + } + else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_FLOOR ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + cemu_assert_debug(aluInstruction->omod == 0); + src->add("tempResultf = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(";" _CRLF); + src->add("tempResultf = floor(tempResultf);" _CRLF); + src->add("tempResultf = clamp(tempResultf, -256.0, 255.0);" _CRLF); + // set AR + if( aluInstruction->destElem == 0 ) + src->add("ARi.x = int(tempResultf);" _CRLF); + else if( aluInstruction->destElem == 1 ) + src->add("ARi.y = int(tempResultf);" _CRLF); + else if( aluInstruction->destElem == 2 ) + src->add("ARi.z = int(tempResultf);" _CRLF); + else + src->add("ARi.w = int(tempResultf);" _CRLF); + // set output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + debugBreakpoint(); // todo + src->add("floatBitsToInt(tempResultf)"); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_INT ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + cemu_assert_debug(aluInstruction->omod == 0); + src->add("tempResulti = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(";" _CRLF); + src->add("tempResulti = clamp(tempResulti, -256, 255);" _CRLF); + // set AR + if( aluInstruction->destElem == 0 ) + src->add("ARi.x = tempResulti;" _CRLF); + else if( aluInstruction->destElem == 1 ) + 
src->add("ARi.y = tempResulti;" _CRLF); + else if( aluInstruction->destElem == 2 ) + src->add("ARi.z = tempResulti;" _CRLF); + else + src->add("ARi.w = tempResulti;" _CRLF); + // set output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + debugBreakpoint(); // todo + src->add("tempResulti"); + src->add(";" _CRLF); + + } + else if( aluInstruction->opcode == ALU_OP2_INST_ADD ) + { + _emitALUOperationBinary(shaderContext, aluInstruction, " + "); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MUL ) + { + // 0*anything is always 0 + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + + // if any operand is a non-zero literal or constant we can use standard multiplication + bool useDefaultMul = false; + if (GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[1].sel)) + { + // result is always zero + src->add("0.0"); + } + else + { + // multiply + if (GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) || + GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[1].sel)) + { + useDefaultMul = true; + } + if (shaderContext->options->strictMul && useDefaultMul == false) + { + src->add("mul_nonIEEE("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else + { + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(" * "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + } + } + + 
_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MUL_IEEE ) + { + // 0*anything according to IEEE rules + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_IEEE) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("1.0"); + src->add(" / "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_FF) + { + // untested (BotW bombs) + src->add("tempResultf = 1.0 / ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + // INF becomes 0.0 + src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + // -INF becomes -0.0 + src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_IEEE || + aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED || + aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF ) + { + // todo: This should be correct but testing is needed + src->add("tempResultf = 1.0 / sqrt("); + _emitOperandInputCode(shaderContext, aluInstruction, 
0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED) + { + // note: if( -INF < 0.0 ) does not resolve to true + src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF) + { + // untested (BotW bombs) + src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + } + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MAX || + aluInstruction->opcode == ALU_OP2_INST_MIN || + aluInstruction->opcode == ALU_OP2_INST_MAX_DX10 || + aluInstruction->opcode == ALU_OP2_INST_MIN_DX10 ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_MAX ) + src->add("max"); + else if( aluInstruction->opcode == ALU_OP2_INST_MIN ) + src->add("min"); + else if (aluInstruction->opcode == ALU_OP2_INST_MAX_DX10) + src->add("max"); + else if (aluInstruction->opcode == ALU_OP2_INST_MIN_DX10) + src->add("min"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + 
_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_FLOOR || + aluInstruction->opcode == ALU_OP2_INST_FRACT || + aluInstruction->opcode == ALU_OP2_INST_TRUNC ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_FLOOR ) + src->add("floor"); + else if( aluInstruction->opcode == ALU_OP2_INST_FRACT ) + src->add("fract"); + else + src->add("trunc"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED || + aluInstruction->opcode == ALU_OP2_INST_LOG_IEEE ) + { + src->add("tempResultf = max(0.0, "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + + src->add("tempResultf = log2(tempResultf);" _CRLF); + if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED ) + { + src->add("if( isinf(tempResultf) == true ) tempResultf = -3.40282347E+38F;" _CRLF); + } + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_RNDNE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + 
_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("roundEven"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_EXP_IEEE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("exp2"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SQRT_IEEE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("sqrt"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SIN || + aluInstruction->opcode == ALU_OP2_INST_COS ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_SIN ) + src->add("sin"); + else + src->add("cos"); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")/0.1591549367)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( 
aluInstruction->opcode == ALU_OP2_INST_FLT_TO_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("int"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_UINT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType); + src->add("uint"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_INT_TO_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("float("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_UINT_TO_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("float("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" 
_CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_AND_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " & "); + else if (aluInstruction->opcode == ALU_OP2_INST_OR_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " | "); + else if (aluInstruction->opcode == ALU_OP2_INST_XOR_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " ^ "); + else if( aluInstruction->opcode == ALU_OP2_INST_NOT_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("~("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_ADD_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " + "); + else if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MIN_INT ) + { + // not verified + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT ) + src->add(" = max("); + else + src->add(" = min("); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(", "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(");" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SUB_INT ) + { + // note: The AMD doc says src1 is on the left side but tests indicate otherwise. It's src0 - src1. 
+ _emitALUOperationBinary(shaderContext, aluInstruction, " - "); + } + else if (aluInstruction->opcode == ALU_OP2_INST_MULLO_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + else if (aluInstruction->opcode == ALU_OP2_INST_MULLO_UINT) + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + else if( aluInstruction->opcode == ALU_OP2_INST_LSHL_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " << "); + else if( aluInstruction->opcode == ALU_OP2_INST_LSHR_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " >> "); + else if( aluInstruction->opcode == ALU_OP2_INST_ASHR_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(" >> "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT || + aluInstruction->opcode == ALU_OP2_INST_SETGE || + aluInstruction->opcode == ALU_OP2_INST_SETNE || + aluInstruction->opcode == ALU_OP2_INST_SETE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_SETGT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE ) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP2_INST_SETNE) + src->add(" != "); + else if (aluInstruction->opcode == ALU_OP2_INST_SETE) + src->add(" == "); + _emitOperandInputCode(shaderContext, 
aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")?1.0:0.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETE_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETNE_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 ) + { + if( aluInstruction->omod != 0 ) + debugBreakpoint(); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_SETE_DX10 ) + src->add(" == "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETNE_DX10 ) + src->add(" != "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_DX10 ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 ) + src->add(" >= "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")?-1:0)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";"); + src->add(_CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETE_INT || + aluInstruction->opcode == ALU_OP2_INST_SETNE_INT || + aluInstruction->opcode == ALU_OP2_INST_SETGT_INT || + aluInstruction->opcode == ALU_OP2_INST_SETGE_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if( aluInstruction->opcode == ALU_OP2_INST_SETE_INT ) + src->add(" == "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETNE_INT ) + 
src->add(" != "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_INT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_INT ) + src->add(" >= "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")?-1:0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_UINT || + aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT ) + { + // todo: Unsure if the result is unsigned or signed + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + if( aluInstruction->opcode == ALU_OP2_INST_SETGE_UINT ) + src->add(" >= "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT ) + src->add(" > "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + src->add(")?int(0xFFFFFFFF):int(0x0)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + bool isIntPred = (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT) || (aluInstruction->opcode == 
ALU_OP2_INST_PRED_SETGE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT); + + src->add("predResult"); + src->add(" = ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, isIntPred?LATTE_DECOMPILER_DTYPE_SIGNED_INT:LATTE_DECOMPILER_DTYPE_FLOAT); + + if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT) + src->add(" > "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT) + src->add(" != "); + else + cemu_assert_debug(false); + + _emitOperandInputCode(shaderContext, aluInstruction, 1, isIntPred?LATTE_DECOMPILER_DTYPE_SIGNED_INT:LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + // handle result of predicate instruction based on current ALU clause type + if( cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE ) + { + src->addFmt("{} = predResult;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = predResult == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_BREAK ) + { + // leave current loop + src->add("if( predResult == false ) break;" _CRLF); + } + else + cemu_assert_debug(false); + } + else if( aluInstruction->opcode == ALU_OP2_INST_KILLE_INT || + aluInstruction->opcode == ALU_OP2_INST_KILLNE_INT || + aluInstruction->opcode == ALU_OP2_INST_KILLGT_INT) + { + src->add("if( "); + src->add(" ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if( 
aluInstruction->opcode == ALU_OP2_INST_KILLE_INT ) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP2_INST_KILLNE_INT) + src->add(" != "); + else if (aluInstruction->opcode == ALU_OP2_INST_KILLGT_INT) + src->add(" > "); + else + debugBreakpoint(); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + src->add(") discard;"); + src->add(_CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_KILLGT || + aluInstruction->opcode == ALU_OP2_INST_KILLGE || + aluInstruction->opcode == ALU_OP2_INST_KILLE ) + { + src->add("if( "); + src->add(" ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_KILLGT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_KILLGE ) + src->add(" >= "); + else if( aluInstruction->opcode == ALU_OP2_INST_KILLE ) + src->add(" == "); + else + debugBreakpoint(); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + src->add(") discard;"); + src->add(_CRLF); + } + else + { + src->add("Unsupported instruction;" _CRLF); + debug_printf("Unsupported ALU op2 instruction 0x%x\n", aluInstruction->opcode); + shaderContext->shader->hasError = true; + } +} + +static void _emitALUOP3InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + cemu_assert_debug(aluInstruction->destRel == 0); // todo + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + + /* check for common no-op or mov-like instructions */ + if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE || + aluInstruction->opcode == ALU_OP3_INST_CMOVE || + aluInstruction->opcode == ALU_OP3_INST_CMOVGT || + aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || + aluInstruction->opcode == 
ALU_OP3_INST_CNDGT_INT || + aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + { + if (_isSameGPROperand(aluInstruction, 1, 2) && !_operandHasModifiers(aluInstruction, 1)) + { + // the condition is irrelevant as both operands are the same + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, outputType); + src->add(";" _CRLF); + return; + } + } + + + /* generic handlers */ + if( aluInstruction->opcode == ALU_OP3_INST_MULADD || + aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE ) + { + // todo: The difference between MULADD and MULADD IEEE is that the former has 0*anything=0 rule similar to MUL/MUL_IEEE? + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if (aluInstruction->opcode != ALU_OP3_INST_MULADD) // avoid unnecessary parenthesis to improve code readability slightly + src->add("("); + + bool useDefaultMul = false; + if (GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) || + GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[1].sel)) + { + useDefaultMul = true; + } + if (aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE) + useDefaultMul = true; + + if (shaderContext->options->strictMul && useDefaultMul == false) + { + src->add("mul_nonIEEE("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else + { + _emitOperandInputCode(shaderContext, aluInstruction, 0, 
LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(" * "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + } + + src->add(" + "); + _emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + if(aluInstruction->opcode != ALU_OP3_INST_MULADD) + src->add(")"); + if( aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 ) + src->add("/2.0"); + else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 ) + src->add("*2.0"); + else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 ) + src->add("*4.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if(aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT || aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + { + bool requiresFloatResult = (aluInstruction->sourceOperand[1].neg != 0) || (aluInstruction->sourceOperand[2].neg != 0); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if (aluInstruction->opcode == ALU_OP3_INST_CNDE_INT) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT) + src->add(" > "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + src->add(" >= "); + src->add("0)?("); + + _emitOperandInputCode(shaderContext, aluInstruction, 1, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("):("); + _emitOperandInputCode(shaderContext, aluInstruction, 2, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("))"); + _emitTypeConversionSuffixMSL(shaderContext, 
requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP3_INST_CMOVGE || + aluInstruction->opcode == ALU_OP3_INST_CMOVE || + aluInstruction->opcode == ALU_OP3_INST_CMOVGT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if (aluInstruction->opcode == ALU_OP3_INST_CMOVE) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGT) + src->add(" > "); + src->add("0.0)?("); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("):("); + _emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("))"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else + { + src->add("Unsupported instruction;" _CRLF); + debug_printf("Unsupported ALU op3 instruction 0x%x\n", aluInstruction->opcode); + shaderContext->shader->hasError = true; + } +} + +static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluRedcInstruction[4]) +{ + StringBuf* src = shaderContext->shaderSource; + if( aluRedcInstruction[0]->isOP3 == false && (aluRedcInstruction[0]->opcode == ALU_OP2_INST_DOT4 || aluRedcInstruction[0]->opcode == ALU_OP2_INST_DOT4_IEEE) ) + { + // todo: Figure out and implement the difference between normal DOT4 and DOT4_IEEE + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[0]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(" = "); + 
_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + + // dot(vec4(op0),vec4(op1)) + src->add("dot(vec4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),vec4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluRedcInstruction[0]->isOP3 == false && (aluRedcInstruction[0]->opcode == ALU_OP2_INST_CUBE) ) + { + /* + * How the CUBE instruction works (guessed mostly, based on DirectX/OpenGL spec): + Input: vec4, 3d direction vector (can be unnormalized) + w component (which can be ignored, since it only scales the vector but does not affect the direction) + + First we figure out the major axis (closest axis-aligned vector). There are six possible vectors: + +rx 0 + -rx 1 + +ry 2 + -ry 3 + +rz 4 + -rz 5 + The major axis vector is calculated by looking at the largest (absolute) 3d vector component and then setting the other components to 0.0 + The value that remains in the axis vector is referred to as 'MajorAxis' by the AMD documentation. + The S,T coordinates are taken from the other two components. 
+ Example: -0.5,0.2,0.4 -> -rx -> -0.5,0.0,0.0 MajorAxis: -0.5, S: 0.2 T: 0.4 + + The CUBE reduction instruction requires a specific mapping for the input vector: + src0 = Rn.zzxy + src1 = Rn.yxzz + It's probably related to the way the instruction works internally? + If we look at the individual components per ALU unit: + z y -> Compare y/z + z x -> Compare x/z + x z -> Compare x/z + y z -> Compare y/z + */ + + sint32 outputType; + + src->add("redcCUBE("); + src->add("vec4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),"); + src->add("vec4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),"); + src->add("cubeMapSTM,cubeMapFaceId);" _CRLF); + + // dst.X (S) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[0]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.x"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.Y (T) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[1]); + 
_emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[1]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.y"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.Z (MajorAxis) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[2]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[2]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.z"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.W (FaceId) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[3]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[3]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("cubeMapFaceId"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else + cemu_assert_unimplemented(); +} + +static void _emitALUClauseRegisterBackupCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex) +{ + sint32 instructionGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; + size_t groupSize = 1; + while ((startIndex + groupSize) < cfInstruction->instructionsALU.size()) + { + if (instructionGroupIndex != cfInstruction->instructionsALU[startIndex + groupSize].instructionGroupIndex) + break; + groupSize++; + } + shaderContext->aluPVPSState->CreateGPRTemporaries(shaderContext, { cfInstruction->instructionsALU.data() + startIndex, groupSize }); +} + +/* +bool _isPVUsedInNextGroup(LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex, sint32 pvUnit) +{ + 
sint32 currentGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; + for (sint32 i = startIndex + 1; i < (sint32)cfInstruction->instructionsALU.size(); i++) + { + LatteDecompilerALUInstruction& aluInstructionItr = cfInstruction->instructionsALU[i]; + if(aluInstructionItr.instructionGroupIndex == currentGroupIndex ) + continue; + if ((sint32)aluInstructionItr.instructionGroupIndex > currentGroupIndex + 1) + return false; + // check OP code type + if (aluInstructionItr.isOP3) + { + // op0 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[0].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[0].chan; + if (pvUnit == chan) + return true; + } + // op1 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[1].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[1].chan; + if (pvUnit == chan) + return true; + } + // op2 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[2].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[2].chan; + if (pvUnit == chan) + return true; + } + } + else + { + // op0 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[0].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[0].chan; + if (pvUnit == chan) + return true; + } + // op1 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[1].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[1].chan; + if (pvUnit == chan) + return true; + } + // todo: Not all operations use both operands + } + } + return false; +} +*/ + +static void _emitVec3(LatteDecompilerShaderContext* shaderContext, uint32 dataType, LatteDecompilerALUInstruction* aluInst0, sint32 opIdx0, LatteDecompilerALUInstruction* aluInst1, sint32 opIdx1, LatteDecompilerALUInstruction* aluInst2, sint32 opIdx2) +{ + StringBuf* src = shaderContext->shaderSource; + if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + src->add("vec3("); + _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + 
_emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst2, opIdx2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + src->add("ivec3("); + _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst2, opIdx2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + } + else + cemu_assert_unimplemented(); +} + +static void _emitGPRVectorAssignment(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction** aluInstructions, sint32 count) +{ + StringBuf* src = shaderContext->shaderSource; + // output var name (GPR) + src->add(_getRegisterVarName(shaderContext, aluInstructions[0]->destGpr, -1)); + src->add("."); + for (sint32 f = 0; f < count; f++) + { + src->add(_getElementStrByIndex(aluInstructions[f]->destElem)); + } + src->add(" = "); +} + +static void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + ALUClauseTemporariesState pvpsState; + shaderContext->aluPVPSState = &pvpsState; + StringBuf* src = shaderContext->shaderSource; + LatteDecompilerALUInstruction* aluRedcInstruction[4]; + size_t groupStartIndex = 0; + for(size_t i=0; iinstructionsALU.size(); i++) + { + LatteDecompilerALUInstruction& aluInstruction = cfInstruction->instructionsALU[i]; + if( aluInstruction.indexInGroup == 0 ) + { + src->addFmt("// {}" _CRLF, aluInstruction.instructionGroupIndex); + // apply PV/PS updates for previous group + if (i > 0) + { + pvpsState.TrackGroupOutputPVPS(shaderContext, cfInstruction->instructionsALU.data() + groupStartIndex, i - groupStartIndex); + } + groupStartIndex = i; + // backup registers which are read after being written + 
_emitALUClauseRegisterBackupCode(shaderContext, cfInstruction, i); + } + // detect reduction instructions and use a special handler + bool isReductionOperation = _isReductionInstruction(&aluInstruction); + if( isReductionOperation ) + { + cemu_assert_debug((i + 4) <= cfInstruction->instructionsALU.size()); + aluRedcInstruction[0] = &aluInstruction; + aluRedcInstruction[1] = &cfInstruction->instructionsALU[i + 1]; + aluRedcInstruction[2] = &cfInstruction->instructionsALU[i + 2]; + aluRedcInstruction[3] = &cfInstruction->instructionsALU[i + 3]; + if( aluRedcInstruction[0]->isOP3 != aluRedcInstruction[1]->isOP3 || aluRedcInstruction[1]->isOP3 != aluRedcInstruction[2]->isOP3 || aluRedcInstruction[2]->isOP3 != aluRedcInstruction[3]->isOP3 ) + debugBreakpoint(); + if( aluRedcInstruction[0]->opcode != aluRedcInstruction[1]->opcode || aluRedcInstruction[1]->opcode != aluRedcInstruction[2]->opcode || aluRedcInstruction[2]->opcode != aluRedcInstruction[3]->opcode ) + debugBreakpoint(); + if( aluRedcInstruction[0]->omod != aluRedcInstruction[1]->omod || aluRedcInstruction[1]->omod != aluRedcInstruction[2]->omod || aluRedcInstruction[2]->omod != aluRedcInstruction[3]->omod ) + debugBreakpoint(); + if( aluRedcInstruction[0]->destClamp != aluRedcInstruction[1]->destClamp || aluRedcInstruction[1]->destClamp != aluRedcInstruction[2]->destClamp || aluRedcInstruction[2]->destClamp != aluRedcInstruction[3]->destClamp ) + debugBreakpoint(); + _emitALUReductionInstructionCode(shaderContext, aluRedcInstruction); + i += 3; // skip the instructions that are part of the reduction operation + } + else /* not a reduction operation */ + { + if( aluInstruction.isOP3 ) + { + // op3 + _emitALUOP3InstructionCode(shaderContext, cfInstruction, &aluInstruction); + } + else + { + // op2 + if( aluInstruction.opcode == ALU_OP2_INST_NOP ) + continue; // skip NOP instruction + _emitALUOP2InstructionCode(shaderContext, cfInstruction, &aluInstruction); + } + } + // handle omod + sint32 outputDataType = 
_getALUInstructionOutputDataType(shaderContext, &aluInstruction); + if( aluInstruction.omod != ALU_OMOD_NONE ) + { + if( outputDataType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + if( aluInstruction.omod == ALU_OMOD_MUL2 ) + src->add(" *= 2.0;" _CRLF); + else if( aluInstruction.omod == ALU_OMOD_MUL4 ) + src->add(" *= 4.0;" _CRLF); + else if( aluInstruction.omod == ALU_OMOD_DIV2 ) + src->add(" /= 2.0;" _CRLF); + } + else if( outputDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = "); + src->add("floatBitsToInt(intBitsToFloat("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(")"); + if( aluInstruction.omod == 1 ) + src->add(" * 2.0"); + else if( aluInstruction.omod == 2 ) + src->add(" * 4.0"); + else if( aluInstruction.omod == 3 ) + src->add(" / 2.0"); + src->add(");" _CRLF); + } + else + { + cemu_assert_unimplemented(); + } + } + // handle clamp + if( aluInstruction.destClamp != 0 ) + { + if( outputDataType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = clamp("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(", 0.0, 1.0);" _CRLF); + } + else if( outputDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = clampFI32("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(");" _CRLF); + } + else + { + cemu_assert_unimplemented(); + } + } + // handle result broadcasting for reduction instructions + if( isReductionOperation ) + { + // reduction operations set all four PV components (todo: Needs further research. According to AMD docs, dot4 only sets PV.x? update: Unlike DOT4, CUBE sets all PV elements accordingly to their GPR output?) 
+ if( aluRedcInstruction[0]->opcode == ALU_OP2_INST_CUBE ) + { + // CUBE + for (sint32 f = 0; f < 4; f++) + { + if (aluRedcInstruction[f]->writeMask != 0) + continue; + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + src->add(" = "); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(";" _CRLF); + } + } + else + { + // DOT4, DOT4_IEEE, etc. + // reduction operation result is only set for output in redc[0], we also need to update redc[1] to redc[3] + for(sint32 f=0; f<4; f++) + { + if( aluRedcInstruction[f]->writeMask == 0 ) + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + else + { + if (f == 0) + continue; + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[f]); + } + src->add(" = "); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(";" _CRLF); + } + } + } + } + shaderContext->aluPVPSState = nullptr; +} + +/* + * Emits code to access one component (xyzw) of the texture coordinate input vector + */ +static void _emitTEXSampleCoordInputComponent(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction, sint32 componentIndex, sint32 interpretSrcAsType) +{ + cemu_assert(componentIndex >= 0 && componentIndex < 4); + cemu_assert_debug(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT || interpretSrcAsType == LATTE_DECOMPILER_DTYPE_FLOAT); + StringBuf* src = shaderContext->shaderSource; + sint32 elementSel = texInstruction->textureFetch.srcSel[componentIndex]; + if (elementSel < 4) + { + _emitRegisterChannelAccessCode(shaderContext, texInstruction->srcGpr, elementSel, interpretSrcAsType); + return; + } + const char* resultElemTable[4] = {"x","y","z","w"}; + if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + if( elementSel == 4 ) + src->add("floatBitsToInt(0.0)"); + else if( elementSel == 5 ) + src->add("floatBitsToInt(1.0)"); + } + else if(interpretSrcAsType == 
LATTE_DECOMPILER_DTYPE_FLOAT ) + { + if( elementSel == 4 ) + src->add("0.0"); + else if( elementSel == 5 ) + src->add("1.0"); + } +} + +static const char* _texGprAccessElemTable[8] = {"x","y","z","w","_","_","_","_"}; + +static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, uint32 dataType, sint8 selX, sint8 selY, sint8 selZ, sint8 selW, char* tempBuffer) +{ + // intBitsToFloat(R{}i.w) + *tempBuffer = '\0'; + uint8 elemCount = (selX > 0 ? 1 : 0) + (selY > 0 ? 1 : 0) + (selZ > 0 ? 1 : 0) + (selW > 0 ? 1 : 0); + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + ; // no conversion + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + strcat(tempBuffer, "intBitsToFloat("); + else + cemu_assert_unimplemented(); + strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); + // _texGprAccessElemTable + strcat(tempBuffer, "."); + if (selX >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selX]); + if (selY >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selY]); + if (selZ >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selZ]); + if (selW >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selW]); + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + ; // no conversion + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + strcat(tempBuffer, ")"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + cemu_assert_unimplemented(); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + ; // no conversion + else + cemu_assert_unimplemented(); + strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); + // _texGprAccessElemTable + strcat(tempBuffer, "."); + if (selX >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selX]); + if (selY >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selY]); + if 
(selZ >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selZ]); + if (selW >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selW]); + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + cemu_assert_unimplemented(); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + ; // no conversion + else + cemu_assert_unimplemented(); + } + else + cemu_assert_unimplemented(); + return tempBuffer; +} + +static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (texInstruction->textureFetch.textureIndex < 0 || texInstruction->textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS) + { + // skip out of bounds texture unit access + return; + } + + auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + + char tempBuffer0[32]; + char tempBuffer1[32]; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + // texture sampler opcode + uint32 texOpcode = texInstruction->opcode; + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + // vertex shader forces LOD to zero, but certain sampler types don't support textureLod(...) 
API + if (texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + texOpcode = GPU7_TEX_INST_SAMPLE_C; + } + // check if offset is used + bool hasOffset = false; + if( texInstruction->textureFetch.offsetX != 0 || texInstruction->textureFetch.offsetY != 0 || texInstruction->textureFetch.offsetZ != 0 ) + hasOffset = true; + // emit sample code + if (shaderContext->shader->textureIsIntegerFormat[texInstruction->textureFetch.textureIndex]) + { + // integer samplers + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) // uint to int + { + if(numWrittenElements == 1) + src->add(" = int("); + else + shaderContext->shaderSource->addFmt(" = ivec{}(", numWrittenElements); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add(" = uintBitsToFloat("); + } + else + { + // float samplers + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add(" = floatBitsToInt("); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add(" = ("); + } + + bool unnormalizationHandled = false; + bool useTexelCoordinates = false; + + // handle illegal combinations + if (texOpcode == GPU7_TEX_INST_FETCH4 && (texDim == Latte::E_DIM::DIM_1D || texDim == Latte::E_DIM::DIM_1D_ARRAY)) + { + // fetch4 is not allowed on 1D textures + // seen in YWW during boss fight of Level 1-4 + // todo - investigate what this returns on actual HW + if (numWrittenElements == 1) + shaderContext->shaderSource->add("0.0"); + else + shaderContext->shaderSource->addFmt("vec{}(0.0)", numWrittenElements); + shaderContext->shaderSource->add(");" _CRLF); + return; + } + + + if (texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3]) ) + { + // texture is likely a RECT + if (hasOffset) + cemu_assert_unimplemented(); 
+ src->add("texelFetch("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else if( texOpcode == GPU7_TEX_INST_FETCH4 ) + { + if( hasOffset ) + cemu_assert_unimplemented(); + src->add("textureGather("); + } + else if( texOpcode == GPU7_TEX_INST_LD ) + { + if( hasOffset ) + cemu_assert_unimplemented(); + src->add("texelFetch("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else if( texOpcode == GPU7_TEX_INST_SAMPLE_L ) + { + // sample with LOD value set in gpr.w (replaces computed LOD value) + if( hasOffset ) + src->add("textureLodOffset("); + else + src->add("textureLod("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ) + { + // sample with LOD set to 0.0 (replaces computed LOD value) + if (hasOffset) + src->add("textureLodOffset("); + else + src->add("textureLod("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + { + // sample with LOD biased + // note: AMD doc says LOD bias is calculated from instruction LOD_BIAS field. But it appears that LOD bias is taken from input register. Might actually be both? 
+ if (hasOffset) + src->add("textureOffset("); + else + src->add("texture("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE) + { + if (hasOffset) + src->add("textureOffset("); + else + src->add("texture("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + // sample with LOD value set in gpr.w (replaces computed LOD value) + if (hasOffset) + src->add("textureLodOffset("); + else + src->add("textureLod("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + // sample with LOD set to 0.0 (replaces computed LOD value) + if (hasOffset) + src->add("textureLodOffset("); + else + src->add("textureLod("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_C) + { + if (hasOffset) + src->add("textureOffset("); + else + src->add("texture("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_G) + { + if (hasOffset) + cemu_assert_unimplemented(); + src->add("textureGrad("); + } + else + { + if( hasOffset ) + cemu_assert_unimplemented(); + cemu_assert_unimplemented(); + src->add("texture("); + } + src->addFmt("tex{}, ", texInstruction->textureFetch.textureIndex); + + // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) 
+ if (texOpcode == GPU7_TEX_INST_FETCH4) + { + if (texDim == Latte::E_DIM::DIM_2D) + { + //src->addFmt2("(vec2(-0.1) / vec2(textureSize(tex{},0).xy)) + ", texInstruction->textureIndex); + + // vec2(-0.00001) is minimum to break Nvidia + // vec2(0.0001) is minimum to fix shadows on Intel, also fixes it on AMD (Windows and Linux) + + // todo - emulating coordinate rounding mode correctly is tricky + // GX2 supports two modes: Truncate or rounding according to DX9 rules + // Vulkan uses truncate mode when point sampling (min and mag is both nearest) otherwise it uses rounding + + // adding a small fixed bias is enough to avoid vendor-specific cases where small inaccuracies cause the number to get rounded down due to truncation + src->addFmt("vec2(0.0001) + "); + } + } + + const sint32 texCoordDataType = (texOpcode == GPU7_TEX_INST_LD) ? LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT; + if(useTexelCoordinates) + { + // handle integer coordinates for texelFetch + if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + { + src->add("ivec2("); + src->add("vec2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, texCoordDataType); + src->addFmt(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, texCoordDataType); + + src->addFmt(")*uf_tex{}Scale", texInstruction->textureFetch.textureIndex); // close vec2 and scale + + src->add("), 0"); // close ivec2 and lod param + // todo - lod + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // VC DS games forget to initialize textures and use texel fetch on an uninitialized texture (a dim of 0 maps to 1D) + src->add("int("); + src->add("float("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, (texOpcode == GPU7_TEX_INST_LD) ? 
LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT); + src->addFmt(")*uf_tex{}Scale.x", texInstruction->textureFetch.textureIndex); + src->add("), 0"); + // todo - lod + } + else + cemu_assert_debug(false); + } + else /* useTexelCoordinates == false */ + { + // float coordinates + if ( (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_L || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) ) + { + // shadow sampler + if (texDim == Latte::E_DIM::DIM_2D_ARRAY) + { + // 3 coords + compare value (as vec4) + src->add("vec4("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + + src->addFmt(",{})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); + } + else if (texDim == Latte::E_DIM::DIM_CUBEMAP) + { + // 2 coords + faceId + if (texInstruction->textureFetch.srcSel[0] >= 4 || texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->add("vec4("); + src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->addFmt(")"); + src->addFmt(",cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // 1 coord + 1 unused coord (per spec) + compare value + if (texInstruction->textureFetch.srcSel[0] >= 4) + { + debugBreakpoint(); + } + src->addFmt("vec3({},0.0,{})", 
_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + } + else + { + // 2 coords + compare value (as vec3) + if (texInstruction->textureFetch.srcSel[0] >= 4 && texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->addFmt("vec3({}, {})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + } + } + else if( texDim == Latte::E_DIM::DIM_3D || texDim == Latte::E_DIM::DIM_2D_ARRAY ) + { + // 3 coords + src->add("vec3("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if( texDim == Latte::E_DIM::DIM_CUBEMAP ) + { + // 2 coords + faceId + cemu_assert_debug(texInstruction->textureFetch.srcSel[0] < 4); + cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); + src->add("vec4("); + src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + src->addFmt(",cubeMapArrayIndex{})", 
texInstruction->textureFetch.textureIndex); // cubemap index + } + else if( texDim == Latte::E_DIM::DIM_1D ) + { + // 1 coord + src->add(_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0)); + } + else + { + // 2 coords + src->add("vec2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + // avoid truncate to effectively round downwards on texel edges + if (ActiveSettings::ForceSamplerRoundToPrecision()) + src->addFmt("+ vec2(1.0)/vec2(textureSize(tex{}, 0))/512.0", texInstruction->textureFetch.textureIndex); + } + // lod or lod bias parameter + if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + src->add(","); + if(texOpcode == GPU7_TEX_INST_SAMPLE_LB) + src->add(_FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + else + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); + } + else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) + { + src->add(",0.0"); + } + } + // gradient parameters + if (texOpcode == GPU7_TEX_INST_SAMPLE_G) + { + if (texDim == Latte::E_DIM::DIM_2D || + texDim == Latte::E_DIM::DIM_1D ) + { + src->add(",gradH.xy,gradV.xy"); + } + else + { + cemu_assert_unimplemented(); + } + } + // offset + if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ || texOpcode == GPU7_TEX_INST_SAMPLE || texOpcode == GPU7_TEX_INST_SAMPLE_C ) + { + if( hasOffset ) + { + uint8 offsetComponentCount = 0; + if( texDim == Latte::E_DIM::DIM_1D ) + offsetComponentCount = 1; + else if( texDim == Latte::E_DIM::DIM_2D ) + offsetComponentCount = 
2; + else if( texDim == Latte::E_DIM::DIM_3D ) + offsetComponentCount = 3; + else if( texDim == Latte::E_DIM::DIM_2D_ARRAY ) + offsetComponentCount = 2; + else + cemu_assert_unimplemented(); + + if( (texInstruction->textureFetch.offsetX&1) ) + cemu_assert_unimplemented(); + if( (texInstruction->textureFetch.offsetY&1) ) + cemu_assert_unimplemented(); + if ((texInstruction->textureFetch.offsetZ & 1)) + cemu_assert_unimplemented(); + + if( offsetComponentCount == 1 ) + src->addFmt(",{}", texInstruction->textureFetch.offsetX/2); + else if( offsetComponentCount == 2 ) + src->addFmt(",ivec2({},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + else if( offsetComponentCount == 3 ) + src->addFmt(",ivec3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + } + } + // lod bias + if( texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) + { + src->add(")"); + + if (numWrittenElements > 1) + { + // result is copied into multiple channels + src->add("."); + for (sint32 f = 0; f < numWrittenElements; f++) + { + cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined + src->add("x"); + } + } + } + else + { + src->add(")."); + for (sint32 f = 0; f < 4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + uint8 elemIndex = texInstruction->dstSel[f]; + if (texOpcode == GPU7_TEX_INST_FETCH4) + { + // 's textureGather() and GPU7's FETCH4 instruction have a different order of elements + // xyzw: top-left, top-right, bottom-right, bottom-left + // textureGather xyzw + // fetch4 yzxw + // translate index from fetch4 to textureGather order + static uint8 fetchToGather[4] = + { + 2, // x -> z + 0, // y -> x + 1, // z -> y + 3, // w -> w + }; + elemIndex = fetchToGather[elemIndex]; + } + src->add(resultElemTable[elemIndex]); + numWrittenElements++; + } + else if( 
texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + } + src->add(");"); + + // debug +#ifdef CEMU_DEBUG_ASSERT + if(texInstruction->opcode == GPU7_TEX_INST_LD ) + src->add(" // TEX_INST_LD"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE ) + src->add(" // TEX_INST_SAMPLE"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_L ) + src->add(" // TEX_INST_SAMPLE_L"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_LZ ) + src->add(" // TEX_INST_SAMPLE_LZ"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_C ) + src->add(" // TEX_INST_SAMPLE_C"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_G ) + src->add(" // TEX_INST_SAMPLE_G"); + else + src->addFmt(" // 0x{:02x}", texInstruction->opcode); + if (texInstruction->opcode != texOpcode) + src->addFmt(" (applied as 0x{:02x})", texOpcode); + src->addFmt(" OffsetXYZ {:02x} {:02x} {:02x}", (uint8)texInstruction->textureFetch.offsetX&0xFF, (uint8)texInstruction->textureFetch.offsetY&0xFF, (uint8)texInstruction->textureFetch.offsetZ&0xFF); +#endif + src->add("" _CRLF); +}
+
+// Emits the statement for GPU7_TEX_INST_GET_TEXTURE_RESINFO.
+// The destination is always written through the integer register name (R<n>i); the
+// right-hand side is textureSize() padded with constant 1s to four components and
+// then swizzled by dstSel.
+static void _emitTEXGetTextureResInfoCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	src->addFmt("R{}", texInstruction->dstGpr);
+	src->add("i");
+	src->add(".");
+
+	const char* resultElemTable[4] = {"x","y","z","w"};
+	sint32 numWrittenElements = 0;
+	// destination write mask: one letter per non-masked channel
+	for(sint32 f=0; f<4; f++)
+	{
+		if( texInstruction->dstSel[f] < 4 )
+		{
+			src->add(resultElemTable[f]);
+			numWrittenElements++;
+		}
+		else if( texInstruction->dstSel[f] == 7 )
+		{
+			// masked and not written
+		}
+		else
+		{
+			cemu_assert_unimplemented();
+		}
+	}
+
+	// todo - mip index parameter?
+
+	auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex];
+
+	// the number of padding 1s depends on how many components textureSize() yields for the dimension
+	if (texDim == Latte::E_DIM::DIM_1D)
+		src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1,1).", texInstruction->textureFetch.textureIndex);
+	else if (texDim == Latte::E_DIM::DIM_1D_ARRAY)
+		src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex);
+	else if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA)
+		src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex);
+	else if (texDim == Latte::E_DIM::DIM_2D_ARRAY)
+		src->addFmt(" = ivec4(textureSize(tex{}, 0),1).", texInstruction->textureFetch.textureIndex);
+	else
+	{
+		cemu_assert_debug(false);
+		src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex);
+	}
+
+	// source swizzle: pick the channel named by dstSel for every written element
+	for(sint32 f=0; f<4; f++)
+	{
+		if( texInstruction->dstSel[f] < 4 )
+		{
+			src->add(resultElemTable[texInstruction->dstSel[f]]);
+			numWrittenElements++;
+		}
+		else if( texInstruction->dstSel[f] == 7 )
+		{
+			// masked and not written
+		}
+		else
+		{
+			debugBreakpoint();
+		}
+	}
+	src->add(";" _CRLF);
+}
+
+// Emits the statement for GPU7_TEX_INST_GET_COMP_TEX_LOD.
+// The textureQueryLod() result is widened to a vec4, converted from float to the
+// shader's default register data type and swizzled by dstSel.
+static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr));
+	src->add(".");
+
+	const char* resultElemTable[4] = {"x","y","z","w"};
+	sint32 numWrittenElements = 0;
+	// destination write mask
+	for(sint32 f=0; f<4; f++)
+	{
+		if( texInstruction->dstSel[f] < 4 )
+		{
+			src->add(resultElemTable[f]);
+			numWrittenElements++;
+		}
+		else if( texInstruction->dstSel[f] == 7 )
+		{
+			// masked and not written
+		}
+		else
+		{
+			debugBreakpoint();
+		}
+	}
+
+	src->add(" = ");
+	_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType);
+
+	if( shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex] == Latte::E_DIM::DIM_CUBEMAP )
+	{
+		// 3 coordinates
+		if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT)
+			src->addFmt("vec4(textureQueryLod(tex{}, {}.{}{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]);
+		else
+			src->addFmt("vec4(textureQueryLod(tex{}, intBitsToFloat({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]);
+	}
+	else
+	{
+		// 2 coordinates
+		if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT)
+			src->addFmt("vec4(textureQueryLod(tex{}, {}.{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]);
+		else
+			src->addFmt("vec4(textureQueryLod(tex{}, intBitsToFloat({}.{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]);
+		// this path always trips the debug breakpoint
+		debugBreakpoint();
+	}
+
+
+	_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType);
+	src->add(".");
+
+	// source swizzle
+	for(sint32 f=0; f<4; f++)
+	{
+		if( texInstruction->dstSel[f] < 4 )
+		{
+			src->add(resultElemTable[texInstruction->dstSel[f]]);
+			numWrittenElements++;
+		}
+		else if( texInstruction->dstSel[f] == 7 )
+		{
+			// masked and not written
+		}
+		else
+		{
+			debugBreakpoint();
+		}
+	}
+	src->add(";" _CRLF);
+}
+
+// Emits the statement for GPU7_TEX_INST_SET_CUBEMAP_INDEX: assigns the source register
+// channel selected by srcSel[0] to cubeMapArrayIndex<textureIndex>.
+static void _emitTEXSetCubemapIndexCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	src->addFmt("cubeMapArrayIndex{}", texInstruction->textureFetch.textureIndex);
+	const char* resultElemTable[4] = {"x","y","z","w"};
+
+	if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT)
+		src->addFmt(" = intBitsToFloat(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]);
+	else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT)
+		src->addFmt(" = R{}f.{};" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]);
+	else
+		cemu_assert_unimplemented();
+}
+
+// Emits the statement for GPU7_TEX_INST_GET_GRADIENTS_H / _V: writes dFdx() or dFdy()
+// of the source register to the non-masked destination channels.
+static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	// number of destination channels that are not masked out (dstSel != 7)
+	sint32 componentCount = 0;
+	for (sint32 i = 0; i < 4; i++)
+	{
+		if(texInstruction->dstSel[i] == 7)
+			continue;
+		componentCount++;
+	}
+	src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr));
+	src->add(".");
+	const char* resultElemTable[4] = { "x","y","z","w" };
+	sint32 numWrittenElements = 0;
+	for (sint32 f = 0; f < 4; f++)
+	{
+		if (texInstruction->dstSel[f] < 4)
+		{
+			src->add(resultElemTable[f]);
+			numWrittenElements++;
+		}
+		else if (texInstruction->dstSel[f] == 7)
+		{
+			// masked and not written
+		}
+		else
+		{
+			debugBreakpoint();
+		}
+	}
+
+	const char* funcName;
+	if (texInstruction->opcode == GPU7_TEX_INST_GET_GRADIENTS_H)
+		funcName = "dFdx";
+	else
+		funcName = "dFdy";
+
+	src->add(" = ");
+
+	_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType);
+
+	src->addFmt("{}(", funcName);
+	// only the first componentCount source selectors are passed; the rest are disabled with -1
+	_emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, (componentCount >= 1) ? texInstruction->textureFetch.srcSel[0] : -1, (componentCount >= 2) ? texInstruction->textureFetch.srcSel[1] : -1, (componentCount >= 3) ? texInstruction->textureFetch.srcSel[2] : -1, (componentCount >= 4)?texInstruction->textureFetch.srcSel[3]:-1, LATTE_DECOMPILER_DTYPE_FLOAT);
+
+	src->add(")");
+
+	_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType);
+
+	src->add(";" _CRLF);
+
+}
+
+// Emits the statement for GPU7_TEX_INST_SET_GRADIENTS_H / _V: stores the source register
+// (as float, swizzled by srcSel) into the shader variable gradH or gradV.
+static void _emitTEXSetGradientsHV(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	if (texInstruction->opcode == GPU7_TEX_INST_SET_GRADIENTS_H)
+		src->add("gradH = ");
+	else
+		src->add("gradV = ");
+
+	_emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], texInstruction->textureFetch.srcSel[2], texInstruction->textureFetch.srcSel[3], LATTE_DECOMPILER_DTYPE_FLOAT);
+
+	src->add(";" _CRLF);
+}
+
+// Emits the read of a vertex shader output inside a geometry shader
+// (v2g[0].passV2GParameter<offset/16>), converted from signed int to the default register type.
+static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr));
+
+	src->add(".");
+
+	const char* resultElemTable[4] = {"x","y","z","w"};
+	sint32 numWrittenElements = 0;
+	// destination write mask
+	for(sint32 f=0; f<4; f++)
+	{
+		if( texInstruction->dstSel[f] < 4 )
+		{
+			src->add(resultElemTable[f]);
+			numWrittenElements++;
+		}
+		else if( texInstruction->dstSel[f] == 7 )
+		{
+			// masked and not written
+		}
+		else
+		{
+			cemu_assert_unimplemented();
+		}
+	}
+
+	src->add(" = ");
+	_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType);
+	src->add("(v2g[");
+	if (texInstruction->textureFetch.srcSel[0] >= 4)
+		cemu_assert_unimplemented();
+	if (texInstruction->textureFetch.srcSel[1] >= 4)
+		cemu_assert_unimplemented();
+	// todo: Index type
+	src->add("0");
+	src->addFmt("].passV2GParameter{}.", texInstruction->textureFetch.offset/16);
+
+
+	// source swizzle
+	for(sint32 f=0; f<4; f++)
+	{
+		if( texInstruction->dstSel[f] < 4 )
+		{
+			src->add(resultElemTable[texInstruction->dstSel[f]]);
+			numWrittenElements++;
+		}
+		else if( texInstruction->dstSel[f] == 7 )
+		{
+			// masked and not written
+		}
+		else
+		{
+			cemu_assert_unimplemented();
+		}
+	}
+	src->add(")");
+	_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType);
+	src->add(";" _CRLF);
+}
+
+// Appends the destination write mask (subset of "xyzw") for the given dstSel array:
+// channel f is written when dstSel[f] < 4 and skipped when dstSel[f] == 7.
+// Returns the number of channel letters appended.
+static sint32 _writeDestMaskXYZW(LatteDecompilerShaderContext* shaderContext, sint8* dstSel)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	const char* resultElemTable[4] = { "x","y","z","w" };
+	sint32 numWrittenElements = 0;
+	for (sint32 f = 0; f < 4; f++)
+	{
+		if (dstSel[f] < 4)
+		{
+			src->add(resultElemTable[f]);
+			numWrittenElements++;
+		}
+		else if (dstSel[f] == 7)
+		{
+			// masked and not written
+		}
+		else
+		{
+			cemu_assert_unimplemented();
+		}
+	}
+	return numWrittenElements;
+}
+
+// Emits the statement for GPU7_TEX_INST_VFETCH: an indexed read from ubuff<textureIndex - 0x80>.
+// Geometry shader reads with textureIndex 0x9F are delegated to _emitGSReadInputVFetchCode.
+static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction)
+{
+	// handle special case where geometry shader reads input attributes from vertex shader via ringbuffer
+	StringBuf* src = shaderContext->shaderSource;
+	if( texInstruction->textureFetch.textureIndex == 0x9F && shaderContext->shaderType == LatteConst::ShaderType::Geometry )
+	{
+		_emitGSReadInputVFetchCode(shaderContext, texInstruction);
+		return;
+	}
+
+	src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr));
+	src->add(".");
+
+	_writeDestMaskXYZW(shaderContext, texInstruction->dstSel);
+	const char* resultElemTable[4] = {"x","y","z","w"};
+
+	src->add(" = ");
+
+	if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT)
+		src->add("floatBitsToInt(");
+	else
+		src->add("(");
+
+	src->addFmt("ubuff{}[", texInstruction->textureFetch.textureIndex - 0x80);
+
+	// the array index must be an integer expression
+	if( shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT )
+		src->addFmt("{}.{}", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]);
+	else
+		src->addFmt("floatBitsToInt({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]);
+	src->add("].");
+
+
+	// source swizzle
+	for(sint32 f=0; f<4; f++)
+	{
+		if( texInstruction->dstSel[f] < 4 )
+		{
+			src->add(resultElemTable[texInstruction->dstSel[f]]);
+		}
+		else if( texInstruction->dstSel[f] == 7 )
+		{
+			// masked and not written
+		}
+		else
+		{
+			debugBreakpoint();
+		}
+	}
+	src->add(");" _CRLF);
+}
+
+// Emits the statement for GPU7_TEX_INST_MEM (memory read).
+// The actual read is not implemented yet: a zero constant sized by memRead.format is
+// emitted instead and truncated to the number of written destination channels.
+static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr));
+	src->add(".");
+	sint32 count = _writeDestMaskXYZW(shaderContext, texInstruction->dstSel);
+
+	src->add(" = ");
+
+	if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT)
+		src->add("floatBitsToInt(");
+	else
+		src->add("(");
+
+	// initialized so the comparison below is well-defined even if the format is
+	// unsupported and the assert macro returns
+	sint32 readCount = 0;
+
+	if (texInstruction->memRead.format == FMT_32_FLOAT)
+	{
+		readCount = 1;
+		// todo
+		src->add("0.0");
+	}
+	else if (texInstruction->memRead.format == FMT_32_32_FLOAT)
+	{
+		readCount = 2;
+		// todo
+		src->add("vec2(0.0,0.0)");
+	}
+	else if (texInstruction->memRead.format == FMT_32_32_32_FLOAT)
+	{
+		readCount = 3;
+		// todo
+		src->add("vec3(0.0,0.0,0.0)");
+	}
+	else
+	{
+		cemu_assert_unimplemented();
+	}
+	// fewer channels written than read: truncate the constant with a swizzle
+	if (count < readCount)
+	{
+		if (count == 1)
+			src->add(".x");
+		else if (count == 2)
+			src->add(".xy");
+		else if (count == 3)
+			src->add(".xyz");
+	}
+	src->add(");" _CRLF);
+}
+
+static void _emitTEXClauseCode(LatteDecompilerShaderContext* shaderContext,
LatteDecompilerCFInstruction* cfInstruction) +{ + cemu_assert_debug(cfInstruction->instructionsALU.empty()); + for(auto& texInstruction : cfInstruction->instructionsTEX) + { + if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ || texInstruction.opcode == GPU7_TEX_INST_FETCH4 || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || texInstruction.opcode == GPU7_TEX_INST_LD ) + _emitTEXSampleTextureCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_GET_TEXTURE_RESINFO ) + _emitTEXGetTextureResInfoCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_GET_COMP_TEX_LOD ) + _emitTEXGetCompTexLodCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_SET_CUBEMAP_INDEX ) + _emitTEXSetCubemapIndexCode(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_H || + texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_V) + _emitTEXGetGradientsHV(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_H || + texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_V) + _emitTEXSetGradientsHV(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_VFETCH) + _emitTEXVFetchCode(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_MEM) + _emitTEXReadMemCode(shaderContext, &texInstruction); + else + cemu_assert_unimplemented(); + } +} + +// generate the code for reading the source input GPR (or constants) for exports +static void _emitExportGPRReadCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, 
sint32 requiredType, uint32 burstIndex) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 numOutputs = 4; + if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + numOutputs = (cfInstruction->memWriteCompMask&1)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&2)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&4)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&8)?1:0; + } + if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if(numOutputs == 1) + src->add("float("); + else + src->addFmt("vec{}(", numOutputs); + } + else if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (numOutputs == 1) + src->add("int("); + else + src->addFmt("ivec{}(", numOutputs); + } + else + cemu_assert_unimplemented(); + sint32 actualOutputs = 0; + for(sint32 i=0; i<4; i++) + { + // todo: Use type of register element based on information from type tracker (currently we assume it's always a signed integer) + uint32 exportSel = 0; + if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + exportSel = i; + if( (cfInstruction->memWriteCompMask&(1<exportComponentSel[i]; + } + if( actualOutputs > 0 ) + src->add(", "); + actualOutputs++; + if( exportSel < 4 ) + { + _emitRegisterAccessCode(shaderContext, cfInstruction->exportSourceGPR+burstIndex, exportSel, -1, -1, -1, requiredType); + } + else if (exportSel == 4) + { + // constant zero + src->add("0"); + } + else if (exportSel == 5) + { + // constant one + src->add("1.0"); + } + else if( exportSel == 7 ) + { + // element masked (which means 0 is exported?) 
+ src->add("0"); + } + else + { + cemu_assert_debug(false); + src->add("0"); + } + } + if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add(")"); + else if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add(")"); + else + cemu_assert_unimplemented(); +} + +static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add("// export" _CRLF); + if(shaderContext->shaderType == LatteConst::ShaderType::Vertex ) + { + if( cfInstruction->exportBurstCount != 0 ) + debugBreakpoint(); + if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) + { + // export position + // GX2 special state 0 disables rasterizer viewport offset and scaling (probably, exact mechanism is not known). Handle this here + bool hasAnyViewportScaleDisabled = + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); + + if (hasAnyViewportScaleDisabled) + { + src->add("vec4 finalPos = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(";" _CRLF); + src->add("finalPos.xy = finalPos.xy * uf_windowSpaceToClipSpaceTransform - vec2(1.0,1.0);"); + src->add("SET_POSITION(finalPos);"); + } + else + { + src->add("SET_POSITION("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(");" _CRLF); + } + } + else if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_POINT_SIZE ) + { + // export gl_PointSize + if (shaderContext->analyzer.outputPointSize) + { + cemu_assert_debug(shaderContext->analyzer.writesPointSize); + src->add("gl_PointSize = ("); + _emitExportGPRReadCode(shaderContext, cfInstruction, 
LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(").x"); + src->add(";" _CRLF); + } + } + else if( cfInstruction->exportType == 2 && cfInstruction->exportArrayBase < 32 ) + { + // export parameter + sint32 paramIndex = cfInstruction->exportArrayBase; + uint32 vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, paramIndex); + if (vsSemanticId != 0xFF) + { + src->addFmt("passParameterSem{} = ", vsSemanticId); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(";" _CRLF); + } + else + { + src->add("// skipped export to semanticId 255" _CRLF); + } + } + else + cemu_assert_unimplemented(); + } + else if(shaderContext->shaderType == LatteConst::ShaderType::Pixel ) + { + if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8 ) + { + for(uint32 i=0; i<(cfInstruction->exportBurstCount+1); i++) + { + sint32 pixelColorOutputIndex = LatteDecompiler_getColorOutputIndexFromExportIndex(shaderContext, cfInstruction->exportArrayBase+i); + // if color output is for target 0, then also handle alpha test + bool alphaTestEnable = shaderContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE(); + auto alphaTestFunc = shaderContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_FUNC(); + if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc == Latte::E_COMPAREFUNC::NEVER ) + { + // never pass alpha test + src->add("discard;" _CRLF); + } + else if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc != Latte::E_COMPAREFUNC::ALWAYS) + { + src->add("if( (("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(").a "); + + switch( alphaTestFunc ) + { + case Latte::E_COMPAREFUNC::LESS: + src->add("<"); + break; + case Latte::E_COMPAREFUNC::EQUAL: + src->add("=="); + break; + case Latte::E_COMPAREFUNC::LEQUAL: + src->add("<="); + break; + case Latte::E_COMPAREFUNC::GREATER: + src->add(">"); + break; + 
case Latte::E_COMPAREFUNC::NOTEQUAL: + src->add("!="); + break; + case Latte::E_COMPAREFUNC::GEQUAL: + src->add(">="); + break; + } + src->add(" uf_alphaTestRef"); + src->add(") == false) discard;" _CRLF); + } + // pixel color output + src->addFmt("passPixelColor{} = ", pixelColorOutputIndex); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(";" _CRLF); + + if( cfInstruction->exportArrayBase+i >= 8 ) + cemu_assert_unimplemented(); + } + } + else if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61 ) + { + // pixel depth or gl_FragStencilRefARB + if( cfInstruction->exportBurstCount > 0 ) + cemu_assert_unimplemented(); + + if (cfInstruction->exportComponentSel[0] == 7) + { + cemu_assert_unimplemented(); // gl_FragDepth ? + } + if (cfInstruction->exportComponentSel[1] != 7) + { + cemu_assert_unimplemented(); // exporting to gl_FragStencilRefARB + } + if (cfInstruction->exportComponentSel[2] != 7) + { + cemu_assert_unimplemented(); // ukn + } + if (cfInstruction->exportComponentSel[3] != 7) + { + cemu_assert_unimplemented(); // ukn + } + + src->add("gl_FragDepth = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(".x"); + src->add(";" _CRLF); + } + else + cemu_assert_unimplemented(); + } +} + +static void _emitXYZWByMask(StringBuf* src, uint32 mask) +{ + if( (mask&(1<<0)) != 0 ) + src->add("x"); + if( (mask&(1<<1)) != 0 ) + src->add("y"); + if( (mask&(1<<2)) != 0 ) + src->add("z"); + if( (mask&(1<<3)) != 0 ) + src->add("w"); +} + +static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + // calculate parameter output (based on ring buffer output offset relative to GS unit) + uint32 bytesPerVertex = shaderContext->contextRegisters[mmSQ_GS_VERT_ITEMSIZE] * 4; + bytesPerVertex = std::max(bytesPerVertex, (uint32)1); // avoid 
division by zero + uint32 parameterOffset = ((cfInstruction->exportArrayBase * 4) % bytesPerVertex); + // for geometry shaders with streamout, MEM_RING_WRITE is used to pass the data to the copy shader, which then uses STREAM*_WRITE + if (shaderContext->shaderType == LatteConst::ShaderType::Geometry && shaderContext->analyzer.hasStreamoutEnable) + { + // if streamout is enabled, we generate transform feedback output code instead of the normal gs output + for (uint32 burstIndex = 0; burstIndex < (cfInstruction->exportBurstCount + 1); burstIndex++) + { + parameterOffset = ((cfInstruction->exportArrayBase * 4 + burstIndex*0x10) % bytesPerVertex); + // find matching stream write in copy shader + LatteGSCopyShaderStreamWrite_t* streamWrite = nullptr; + for (auto& it : shaderContext->parsedGSCopyShader->list_streamWrites) + { + if (it.offset == parameterOffset) + { + streamWrite = ⁢ + break; + } + } + if (streamWrite == nullptr) + { + cemu_assert_suspicious(); + return; + } + + for (sint32 i = 0; i < 4; i++) + { + if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) + continue; + + if (shaderContext->options->useTFViaSSBO) + { + uint32 u32Offset = streamWrite->exportArrayBase + i; + src->addFmt("sb_buffer[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); + } + else + { + src->addFmt("sb{}[{}]", streamWrite->bufferIndex, streamWrite->exportArrayBase + i); + } + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->addFmt("{}.", _getRegisterVarName(shaderContext, cfInstruction->exportSourceGPR+burstIndex)); + if (i == 0) + src->add("x"); + else if (i == 1) + src->add("y"); + else if (i == 2) + src->add("z"); + else if (i == 3) + src->add("w"); + + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(";" _CRLF); + } + } + return; + } + + if (shaderContext->shaderType == 
LatteConst::ShaderType::Vertex) + { + if (cfInstruction->memWriteElemSize != 3) + cemu_assert_unimplemented(); + if ((cfInstruction->exportArrayBase & 3) != 0) + cemu_assert_unimplemented(); + for (sint32 burstIndex = 0; burstIndex < (sint32)(cfInstruction->exportBurstCount + 1); burstIndex++) + { + src->addFmt("v2g.passV2GParameter{}.", (cfInstruction->exportArrayBase) / 4 + burstIndex); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_SIGNED_INT, burstIndex); + src->add(";" _CRLF); + } + } + else if (shaderContext->shaderType == LatteConst::ShaderType::Geometry) + { + cemu_assert_debug(cfInstruction->memWriteElemSize == 3); + //if (cfInstruction->memWriteElemSize != 3) + // debugBreakpoint(); + cemu_assert_debug((cfInstruction->exportArrayBase & 3) == 0); + + for (uint32 burstIndex = 0; burstIndex < (cfInstruction->exportBurstCount + 1); burstIndex++) + { + uint32 parameterExportType = 0; + uint32 parameterExportBase = 0; + if (LatteGSCopyShaderParser_getExportTypeByOffset(shaderContext->parsedGSCopyShader, parameterOffset + burstIndex * (cfInstruction->memWriteElemSize+1)*4, ¶meterExportType, ¶meterExportBase) == false) + { + cemu_assert_debug(false); + shaderContext->hasError = true; + return; + } + + if (parameterExportType == 1 && parameterExportBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) + { + src->add("{" _CRLF); + src->addFmt("vec4 pos = vec4(0.0,0.0,0.0,1.0);" _CRLF); + src->addFmt("pos."); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); + src->add(";" _CRLF); + src->add("SET_POSITION(pos);" _CRLF); + src->add("}" _CRLF); + } + else if (parameterExportType == 2 && parameterExportBase < 16) + { + src->addFmt("passG2PParameter{}.", parameterExportBase); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + 
src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); + src->add(";" _CRLF); + } + else + cemu_assert_debug(false); + } + } + else + debugBreakpoint(); // todo +} + +static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (shaderContext->analyzer.hasStreamoutEnable == false) + { +#ifdef CEMU_DEBUG_ASSERT + src->add("// omitted streamout write" _CRLF); +#endif + return; + } + uint32 streamoutBufferIndex; + if (cfInstruction->type == GPU7_CF_INST_MEM_STREAM0_WRITE) + streamoutBufferIndex = 0; + else if (cfInstruction->type == GPU7_CF_INST_MEM_STREAM1_WRITE) + streamoutBufferIndex = 1; + else + cemu_assert_unimplemented(); + + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + uint32 arraySize = cfInstruction->memWriteArraySize + 1; + + for (sint32 i = 0; i < (sint32)arraySize; i++) + { + if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) + continue; + + if (shaderContext->options->useTFViaSSBO) + { + uint32 u32Offset = cfInstruction->exportArrayBase + i; + src->addFmt("sb_buffer[sbBase{} + {}]", streamoutBufferIndex, u32Offset); + } + else + { + src->addFmt("sb{}[{}]", streamoutBufferIndex, cfInstruction->exportArrayBase + i); + } + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(_getRegisterVarName(shaderContext, cfInstruction->exportSourceGPR)); + _appendChannelAccess(src, i); + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(";" _CRLF); + } + } + else + cemu_assert_debug(false); +} + +static void _emitCFCall(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 
subroutineAddr = cfInstruction->addr; + LatteDecompilerSubroutineInfo* subroutineInfo = nullptr; + // find subroutine + for (auto& subroutineItr : shaderContext->list_subroutines) + { + if (subroutineItr.cfAddr == subroutineAddr) + { + subroutineInfo = &subroutineItr; + break; + } + } + if (subroutineInfo == nullptr) + { + cemu_assert_debug(false); + return; + } + // inline function + if (shaderContext->isSubroutine) + { + cemu_assert_debug(false); // inlining with cascaded function calls not supported + return; + } + // init CF stack variables + src->addFmt("activeMaskStackSub{:04x}[0] = true;" _CRLF, subroutineInfo->cfAddr); + src->addFmt("activeMaskStackCSub{:04x}[0] = true;" _CRLF, subroutineInfo->cfAddr); + src->addFmt("activeMaskStackCSub{:04x}[1] = true;" _CRLF, subroutineInfo->cfAddr); + + shaderContext->isSubroutine = true; + shaderContext->subroutineInfo = subroutineInfo; + for(auto& cfInstruction : subroutineInfo->instructions) + LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, true); + shaderContext->isSubroutine = false; + shaderContext->subroutineInfo = nullptr; +} + +void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine) +{ + StringBuf* src = shaderContext->shaderSource; + + if( cfInstruction->type == GPU7_CF_INST_ALU || cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE || cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER || cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER || cfInstruction->type == GPU7_CF_INST_ALU_BREAK || cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER ) + { + // emit ALU code + if (shaderContext->analyzer.modifiesPixelActiveState) + { + if(cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE) + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 1)); + else + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, 
cfInstruction->activeStackDepth + 1)); + } + if (cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE) + { + src->addFmt("{} = {};" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth-1)); + src->addFmt("{} = {};" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + _emitALUClauseCode(shaderContext, cfInstruction); + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->add("}" _CRLF); + cemu_assert_debug(!(shaderContext->analyzer.modifiesPixelActiveState == false && cfInstruction->type != GPU7_CF_INST_ALU)); + // handle ELSE case of PUSH_BEFORE + if( cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE ) + { + src->add("else {" _CRLF); + src->addFmt("{} = false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = false;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + src->add("}" _CRLF); + } + // post clause handler + if( cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - 1)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 2), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - 2), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - 2)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER ) + { + // no condition test + // pop stack + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); 
+ // else operation + src->addFmt("{} = {} == false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + } + else if( cfInstruction->type == GPU7_CF_INST_TEX ) + { + // emit TEX code + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth+1)); + } + _emitTEXClauseCode(shaderContext, cfInstruction); + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->add("}" _CRLF); + } + } + else if( cfInstruction->type == GPU7_CF_INST_EXPORT || cfInstruction->type == GPU7_CF_INST_EXPORT_DONE ) + { + // emit export code + _emitExportCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_ELSE ) + { + // todo: Condition test, popCount? 
+ src->addFmt("{} = {} == false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + else if( cfInstruction->type == GPU7_CF_INST_POP ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - cfInstruction->popCount), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - cfInstruction->popCount), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - cfInstruction->popCount)); + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_START_DX10 || + cfInstruction->type == GPU7_CF_INST_LOOP_START_NO_AL) + { + // start of loop + // if pixel is disabled, then skip loop + if (ActiveSettings::ShaderPreventInfiniteLoopsEnabled()) + { + // with iteration limit to prevent infinite loops + src->addFmt("int loopCounter{} = 0;" _CRLF, (sint32)cfInstruction->cfAddr); + src->addFmt("while( {} == true && loopCounter{} < 500 )" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), (sint32)cfInstruction->cfAddr); + src->add("{" _CRLF); + src->addFmt("loopCounter{}++;" _CRLF, (sint32)cfInstruction->cfAddr); + } + else + { + src->addFmt("while( {} == true )" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + src->add("{" _CRLF); + } + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_END ) + { + // this might not always work + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + src->add("}" _CRLF); + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_BREAK ) + { + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + if 
(shaderContext->analyzer.modifiesPixelActiveState) + { + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + } + // note: active stack level is set to the same level as the loop begin. popCount is ignored + src->add("break;" _CRLF); + + if (shaderContext->analyzer.modifiesPixelActiveState) + src->add("}" _CRLF); + + } + else if( cfInstruction->type == GPU7_CF_INST_MEM_STREAM0_WRITE || + cfInstruction->type == GPU7_CF_INST_MEM_STREAM1_WRITE ) + { + _emitStreamWriteCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + _emitCFRingWriteCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_EMIT_VERTEX ) + { + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + // write point size + if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) + src->add("gl_PointSize = uf_pointSize;" _CRLF); + // emit vertex + src->add("EmitVertex();" _CRLF); + // increment transform feedback pointer + if (shaderContext->analyzer.useSSBOForStreamout) + { + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0); + src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4); + } + } + + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->add("}" _CRLF); + } + else if (cfInstruction->type == GPU7_CF_INST_CALL) + { + _emitCFCall(shaderContext, cfInstruction); + } + else if (cfInstruction->type == GPU7_CF_INST_RETURN) + { + // todo (handle properly) + } + else + { + cemu_assert_debug(false); + } +} + +void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderContext, 
StringBuf* fCStr_shaderSource) +{ + if( shaderContext->analyzer.hasRedcCUBE ) + { + fCStr_shaderSource->add("void redcCUBE(vec4 src0, vec4 src1, out vec3 stm, out int faceId)\r\n" + "{\r\n" + "// stm -> x .. s, y .. t, z .. MajorAxis*2.0\r\n" + + "vec3 inputCoord = normalize(vec3(src1.y, src1.x, src0.x));\r\n" + + "float rx = inputCoord.x;\r\n" + "float ry = inputCoord.y;\r\n" + "float rz = inputCoord.z;\r\n" + "if( abs(rx) > abs(ry) && abs(rx) > abs(rz) )\r\n" + "{\r\n" + "stm.z = rx*2.0;\r\n" + "stm.xy = vec2(ry,rz); \r\n" + "if( rx >= 0.0 )\r\n" + "{\r\n" + "faceId = 0;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 1;\r\n" + "}\r\n" + "}\r\n" + "else if( abs(ry) > abs(rx) && abs(ry) > abs(rz) )\r\n" + "{\r\n" + "stm.z = ry*2.0;\r\n" + "stm.xy = vec2(rx,rz); \r\n" + "if( ry >= 0.0 )\r\n" + "{\r\n" + "faceId = 2;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 3;\r\n" + "}\r\n" + "}\r\n" + "else //if( abs(rz) > abs(ry) && abs(rz) > abs(rx) )\r\n" + "{\r\n" + "stm.z = rz*2.0;\r\n" + "stm.xy = vec2(rx,ry); \r\n" + "if( rz >= 0.0 )\r\n" + "{\r\n" + "faceId = 4;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 5;\r\n" + "}\r\n" + "}\r\n" + "}\r\n"); + } + + if( shaderContext->analyzer.hasCubeMapTexture ) + { + fCStr_shaderSource->add("vec3 redcCUBEReverse(vec2 st, int faceId)\r\n" + "{\r\n" + "st.yx = st.xy;\r\n" + "vec3 v;\r\n" + "float majorAxis = 1.0;\r\n" + "if( faceId == 0 )\r\n" + "{\r\n" + "v.yz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.x = 1.0;\r\n" + "}\r\n" + "else if( faceId == 1 )\r\n" + "{\r\n" + "v.yz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.x = -1.0;\r\n" + "}\r\n" + "else if( faceId == 2 )\r\n" + "{\r\n" + "v.xz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.y = 1.0;\r\n" + "}\r\n" + "else if( faceId == 3 )\r\n" + "{\r\n" + "v.xz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.y = -1.0;\r\n" + "}\r\n" + "else if( faceId == 4 )\r\n" + "{\r\n" + "v.xy = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.z = 1.0;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "v.xy 
= (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.z = -1.0;\r\n" + "}\r\n" + + "return v;\r\n" + "}\r\n"); + } + + // clamp + fCStr_shaderSource->add("" + "int clampFI32(int v)\r\n" + "{\r\n" + "if( v == 0x7FFFFFFF )\r\n" + " return floatBitsToInt(1.0);\r\n" + "else if( v == 0xFFFFFFFF )\r\n" + " return floatBitsToInt(0.0);\r\n" + "return floatBitsToInt(clamp(intBitsToFloat(v), 0.0, 1.0));\r\n" + "}\r\n"); + // mul non-ieee way (0*NaN/INF => 0.0) + if (shaderContext->options->strictMul) + { + // things we tried: + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(a*b,0.0,a==0.0||b==0.0); }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(vec2(a*b,0.0),vec2(0.0,0.0),(equal(vec2(a),vec2(0.0,0.0))||equal(vec2(b),vec2(0.0,0.0)))).x; }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = intBitsToFloat(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works + + // for "min" it used to be: float mul_nonIEEE(float a, float b){ return min(a*b,min(abs(a)*3.40282347E+38F,abs(b)*3.40282347E+38F)); } + + if( LatteGPUState.glVendor == GLVENDOR_NVIDIA && !ActiveSettings::DumpShadersEnabled()) + fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){return mix(0.0, a*b, (a != 0.0) && (b != 0.0));}" _CRLF); // compiles faster on Nvidia and also results in lower RAM usage (OpenGL) + else + fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" _CRLF); + + // DXKV-like: fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return (b==0.0 ? 0.0 : a) * (a==0.0 ? 
0.0 : b); }" _CRLF); + } +} + +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp" + +static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* shaderContext, LatteParsedFetchShaderAttribute_t& attrib) +{ + auto src = shaderContext->shaderSource; + + static const char* dsMappingTableFloat[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", /*"floatBitsToInt(0.0)"*/ "0", /*"floatBitsToInt(1.0)"*/ "0x3f800000" }; + static const char* dsMappingTableInt[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", "0", "1" }; + + // get register index based on vtx semantic table + uint32 attributeShaderLoc = 0xFFFFFFFF; + for (sint32 f = 0; f < 32; f++) + { + if (shaderContext->contextRegisters[mmSQ_VTX_SEMANTIC_0 + f] == attrib.semanticId) + { + attributeShaderLoc = f; + break; + } + } + if (attributeShaderLoc == 0xFFFFFFFF) + return; // attribute is not mapped to VS input + uint32 registerIndex = attributeShaderLoc + 1; // R0 is skipped + // is register used? 
+ if ((shaderContext->analyzer.gprUseMask[registerIndex / 8] & (1 << (registerIndex % 8))) == 0) + { + src->addFmt("// skipped unused attribute for r{}" _CRLF, registerIndex); + return; + } + + LatteDecompiler_emitAttributeDecodeMSL(shaderContext->shader, src, &attrib); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = ivec4(", _getRegisterVarName(shaderContext, registerIndex)); + else + src->addFmt("{} = vec4(", _getRegisterVarName(shaderContext, registerIndex)); + for (sint32 f = 0; f < 4; f++) + { + uint8 ds = attrib.ds[f]; + if (f > 0) + src->add(", "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + if (ds >= 6) + { + cemu_assert_unimplemented(); + ds = 4; // read as 0.0 + } + if (attrib.nfa != 1) + { + src->add(dsMappingTableFloat[ds]); + } + else + { + src->add(dsMappingTableInt[ds]); + } + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + } + src->add(");" _CRLF); +} + +void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader) +{ + StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) + shaderContext->shaderSource = src; + + // debug info + src->addFmt("// shader {:016x}" _CRLF, shaderContext->shaderBaseHash); +#ifdef CEMU_DEBUG_ASSERT + src->addFmt("// usesIntegerValues: {}" _CRLF, shaderContext->analyzer.usesIntegerValues?"true":"false"); + src->addFmt(_CRLF); +#endif + // header part (definitions for inputs and outputs) + LatteDecompiler::emitHeader(shaderContext); + // helper functions + LatteDecompiler_emitHelperFunctions(shaderContext, src); + // start of main + src->add("void main()" _CRLF); + src->add("{" _CRLF); + // variable definition + if (shaderContext->typeTracker.useArrayGPRs == false) + { + // each register is a separate 
variable + for (sint32 i = 0; i < 128; i++) + { + if (shaderContext->analyzer.usesRelativeGPRRead || (shaderContext->analyzer.gprUseMask[i / 8] & (1 << (i & 7))) != 0) + { + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int4 R{}i = int4(0);" _CRLF, i); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float4 R{}f = float4(0.0);" _CRLF, i); + } + } + } + else + { + // registers are represented using a single large array + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int4 Ri[128];" _CRLF); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float4 Rf[128];" _CRLF); + for (sint32 i = 0; i < 128; i++) + { + if (shaderContext->typeTracker.genIntReg) + src->addFmt("Ri[{}] = int4(0);" _CRLF, i); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("Rf[{}] = float4(0.0);" _CRLF, i); + } + } + + if( shader->shaderType == LatteConst::ShaderType::Vertex ) + src->addFmt("uint4 attrDecoder;" _CRLF); + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int backupReg0i, backupReg1i, backupReg2i, backupReg3i, backupReg4i;" _CRLF); + if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float backupReg0f, backupReg1f, backupReg2f, backupReg3f, backupReg4f;" _CRLF); + if (shaderContext->typeTracker.genIntReg) + { + src->addFmt("int PV0ix = 0, PV0iy = 0, PV0iz = 0, PV0iw = 0, PV1ix = 0, PV1iy = 0, PV1iz = 0, PV1iw = 0;" _CRLF); + src->addFmt("int PS0i = 0, PS1i = 0;" _CRLF); + src->addFmt("int4 tempi = int4(0);" _CRLF); + } + if (shaderContext->typeTracker.genFloatReg) + { + src->addFmt("float PV0fx = 0.0, PV0fy = 0.0, PV0fz = 0.0, PV0fw = 0.0, PV1fx = 0.0, PV1fy = 0.0, PV1fz = 0.0, PV1fw = 0.0;" _CRLF); + src->addFmt("float PS0f = 0.0, PS1f = 0.0;" _CRLF); + src->addFmt("float4 tempf = float4(0.0);" _CRLF); + } + if (shaderContext->analyzer.hasGradientLookup) + { + src->add("float4 gradH;" _CRLF); + src->add("float4 gradV;" _CRLF); + } + src->add("float tempResultf;" _CRLF); + src->add("int tempResulti;" 
_CRLF); + src->add("int4 ARi = int4(0);" _CRLF); + src->add("bool predResult = true;" _CRLF); + if(shaderContext->analyzer.modifiesPixelActiveState ) + { + src->addFmt("bool activeMaskStack[{}];" _CRLF, shaderContext->analyzer.activeStackMaxDepth+1); + src->addFmt("bool activeMaskStackC[{}];" _CRLF, shaderContext->analyzer.activeStackMaxDepth+2); + for (sint32 i = 0; i < shaderContext->analyzer.activeStackMaxDepth; i++) + { + src->addFmt("activeMaskStack[{}] = false;" _CRLF, i); + } + for (sint32 i = 0; i < shaderContext->analyzer.activeStackMaxDepth+1; i++) + { + src->addFmt("activeMaskStackC[{}] = false;" _CRLF, i); + } + src->addFmt("activeMaskStack[0] = true;" _CRLF); + src->addFmt("activeMaskStackC[0] = true;" _CRLF); + src->addFmt("activeMaskStackC[1] = true;" _CRLF); + // generate vars for each subroutine + for (auto& subroutineInfo : shaderContext->list_subroutines) + { + sint32 subroutineMaxStackDepth = 0; + src->addFmt("bool activeMaskStackSub{:04x}[{}];" _CRLF, subroutineInfo.cfAddr, subroutineMaxStackDepth + 1); + src->addFmt("bool activeMaskStackCSub{:04x}[{}];" _CRLF, subroutineInfo.cfAddr, subroutineMaxStackDepth + 2); + } + } + // helper variables for cube maps (todo: Only emit when used) + if (shaderContext->analyzer.hasRedcCUBE) + { + src->add("float3 cubeMapSTM;" _CRLF); + src->add("int cubeMapFaceId;" _CRLF); + } + for(sint32 i=0; ioutput->textureUnitMask[i]) + continue; + if( shader->textureUnitDim[i] != Latte::E_DIM::DIM_CUBEMAP ) + continue; + src->addFmt("float cubeMapArrayIndex{} = 0.0;" _CRLF, i); + } + // init base offset for streamout buffer writes + if (shaderContext->analyzer.useSSBOForStreamout && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + { + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if(!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i]&3) == 0); + + if 
(shader->shaderType == LatteConst::ShaderType::Vertex) // vertex shader + src->addFmt("int sbBase{} = uf_streamoutBufferBase{}/4 + (gl_VertexID + uf_verticesPerInstance * gl_InstanceID)*{};" _CRLF, i, i, shaderContext->output->streamoutBufferStride[i] / 4); + else // geometry shader + { + uint32 gsOutPrimType = shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; + uint32 bytesPerVertex = shaderContext->contextRegisters[mmSQ_GS_VERT_ITEMSIZE] * 4; + uint32 maxVerticesInGS = ((shaderContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) * 4) / bytesPerVertex; + + cemu_assert_debug(gsOutPrimType == 0); // currently we only properly handle GS output primitive points + + src->addFmt("int sbBase{} = uf_streamoutBufferBase{}/4 + (gl_PrimitiveIDIn * {})*{};" _CRLF, i, i, maxVerticesInGS, shaderContext->output->streamoutBufferStride[i] / 4); + } + } + + } + // code to load inputs from previous stage + if( shader->shaderType == LatteConst::ShaderType::Vertex ) + { + if( (shaderContext->analyzer.gprUseMask[0/8]&(1<<(0%8))) != 0 ) + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = int4(gl_VertexID, 0, 0, gl_InstanceID);" _CRLF, _getRegisterVarName(shaderContext, 0)); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = as_type(float4(gl_VertexID, 0, 0, gl_InstanceID));" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: is this correct? 
+ else + cemu_assert_unimplemented(); + } + + LatteFetchShader* parsedFetchShader = shaderContext->fetchShader; + for(auto& bufferGroup : parsedFetchShader->bufferGroups) + { + for(sint32 i=0; ibufferGroupsInvalid) + { + // these attributes point to non-existent buffers + // todo - figure out how the hardware actually handles this, currently we assume the input values are zero + for (sint32 i = 0; i < bufferGroup.attribCount; i++) + LatteDecompiler_emitAttributeImport(shaderContext, bufferGroup.attrib[i]); + } + } + else if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + + uint32 psControl0 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_0]; + uint32 psControl1 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_1]; + + uint32 spiInterpControl = shaderContext->contextRegisters[mmSPI_INTERP_CONTROL_0]; + uint8 spriteEnable = (spiInterpControl >> 1) & 1; + cemu_assert_debug(spriteEnable == 0); + + uint8 frontFace_enabled = (psControl1 >> 8) & 1; + uint8 frontFace_chan = (psControl1 >> 9) & 3; + uint8 frontFace_allBits = (psControl1 >> 11) & 1; + uint8 frontFace_regIndex = (psControl1 >> 12) & 0x1F; + + // handle param_gen + if (psInputTable->paramGen != 0) + { + cemu_assert_debug((psInputTable->paramGen) == 1); // handle the other bits (the same set of coordinates with different perspective/projection settings?) 
+ uint32 paramGenGPRIndex = psInputTable->paramGenGPR; + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = in.position.xyxy;" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + else + src->addFmt("{} = as_type(gl_PointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + } + + for (sint32 i = 0; i < psInputTable->count; i++) + { + uint32 psControl0 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_0]; + uint32 spi0_paramGen = (psControl0 >> 15) & 0xF; + + sint32 gprIndex = i;// +spi0_paramGen + paramRegOffset; + if ((shaderContext->analyzer.gprUseMask[gprIndex / 8] & (1 << (gprIndex % 8))) == 0 && shaderContext->analyzer.usesRelativeGPRRead == false) + continue; + uint32 psInputSemanticId = psInputTable->import[i].semanticId; + if (psInputSemanticId == LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION) + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = GET_FRAGCOORD();" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + else + src->addFmt("{} = as_type(GET_FRAGCOORD());" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + continue; + } + + if (shaderContext->options->usesGeometryShader) + { + // import from geometry shader (integer register mode reinterprets the bits via as_type, same as the vertex shader path below) + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = as_type(passG2PParameter{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = passG2PParameter{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + else + cemu_assert_unimplemented(); + } + else + { + // import from vertex shader + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = as_type(passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex),
psInputSemanticId); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + else + cemu_assert_unimplemented(); + } + } + // front facing attribute + if (frontFace_enabled) + { + if ((shaderContext->analyzer.gprUseMask[0 / 8] & (1 << (0 % 8))) != 0) + { + if (frontFace_allBits) + cemu_assert_debug(false); + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{}.{} = as_type(gl_FrontFacing?1.0:0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{}.{} = frontFacing ? 1.0 : 0.0;" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + else + cemu_assert_debug(false); + } + } + } + for(auto& cfInstruction : shaderContext->cfInstructions) + LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, false); + if( shader->shaderType == LatteConst::ShaderType::Geometry ) + src->add("EndPrimitive();" _CRLF); + // vertex shader should write renderstate point size at the end if required but not modified by shader + if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader == false) + src->add("out.pointSize = uf_pointSize;" _CRLF); + } + // end of shader main + src->add("}" _CRLF); + src->shrink_to_fit(); + shader->strBuf_shaderSource = src; +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp new file mode 100644 index 00000000..8219646a --- /dev/null +++ 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -0,0 +1,508 @@ +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/Latte.h" +#include "Cafe/HW/Latte/Core/LatteDraw.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "util/helpers/StringBuf.h" + +#define _CRLF "\r\n" + +static void _readLittleEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(attrDataSem{}.xyz,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU16x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); +} + +static void _readBigEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + 
src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); + src->add("attrDecoder = (attrDecoder>>24)|((attrDecoder>>8)&0xFF00)|((attrDecoder<<8)&0xFF0000)|((attrDecoder<<24));" _CRLF); +} + +static void _readBigEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xyz = attrDataSem{}.xyz;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xyz = (attrDecoder.xyz>>24)|((attrDecoder.xyz>>8)&0xFF00)|((attrDecoder.xyz<<8)&0xFF0000)|((attrDecoder.xyz<<24));" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xy = (attrDecoder.xy>>24)|((attrDecoder.xy>>8)&0xFF00)|((attrDecoder.xy<<8)&0xFF0000)|((attrDecoder.xy<<24));" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.x = attrDataSem{}.x;" _CRLF, attributeInputIndex); + src->add("attrDecoder.x = (attrDecoder.x>>24)|((attrDecoder.x>>8)&0xFF00)|((attrDecoder.x<<8)&0xFF0000)|((attrDecoder.x<<24));" _CRLF); + src->add("attrDecoder.y = 0;" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.x = ((attrDecoder.x>>8)&0xFF)|((attrDecoder.x<<8)&0xFF00);" _CRLF); + src->add("attrDecoder.y = 0;" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x2(LatteDecompilerShader* 
shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xy = ((attrDecoder.xy>>8)&0xFF)|((attrDecoder.xy<<8)&0xFF00);" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("attrDecoder = ((attrDecoder>>8)&0xFF)|((attrDecoder<<8)&0xFF00);" _CRLF); +} + +void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib) +{ + if (attrib->attributeBufferIndex >= Latte::GPU_LIMITS::NUM_VERTEX_BUFFERS) + { + src->add("attrDecoder = int4(0);" _CRLF); + return; + } + + uint32 attributeInputIndex = attrib->semanticId; + if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_U32 ) + { + if( attrib->format == FMT_32_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_2_10_10_10 && attrib->nfa == 0 ) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + // Bayonetta 2 uses this format to store normals + src->add("attrDecoder.xyzw = uint4((attrDecoder.x>>0)&0x3FF,(attrDecoder.x>>10)&0x3FF,(attrDecoder.x>>20)&0x3FF,(attrDecoder.x>>30)&0x3);" _CRLF); + if (attrib->isSigned 
!= 0) + { + src->add("if( (attrDecoder.x&0x200) != 0 ) attrDecoder.x |= 0xFFFFFC00;" _CRLF); + src->add("if( (attrDecoder.y&0x200) != 0 ) attrDecoder.y |= 0xFFFFFC00;" _CRLF); + src->add("if( (attrDecoder.z&0x200) != 0 ) attrDecoder.z |= 0xFFFFFC00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/511.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/511.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/511.0,-1.0));" _CRLF); + } + else + { + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/1023.0,-1.0));" _CRLF); + } + src->add("attrDecoder.w = as_type(float(attrDecoder.w));" _CRLF); // unsure? + + } + else if( attrib->format == FMT_32_32_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 0) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 1) + { + // we can just read the signed s32 as a u32 since no sign-extension is necessary + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = 
as_type(float4(attrDataSem{}.wzyx)/255.0);" _CRLF, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/127.0,-1.0));" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) + { + // seen in Ben 10 Omniverse; the emitted text uses float4 (MSL has no vec4 type), matching the SWAP_NONE FMT_8_8_8_8 branches + src->addFmt("attrDecoder.xyzw = as_type(float4(attrDataSem{}.wzyx));" _CRLF, attributeInputIndex); + } + else + { + cemuLog_log(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}\n", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + cemu_assert_unimplemented(); + } + } + else if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_NONE ) + { + if( attrib->format == FMT_32_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readLittleEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32_32_32_FLOAT && attrib->nfa == 2) + { + 
_readLittleEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32_32_FLOAT && attrib->nfa == 2) + { + // seen in Cities of Gold + _readLittleEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 0) + { + // seen in Nano Assault Neo + _readLittleEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_2_10_10_10 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in Fast Racing Neo + _readLittleEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xyzw = uint4((attrDecoder.x>>0)&0x3FF,(attrDecoder.x>>10)&0x3FF,(attrDecoder.x>>20)&0x3FF,(attrDecoder.x>>30)&0x3);" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(float(attrDecoder.w));" _CRLF); // todo - is this correct? 
+ } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + // seen in CoD ghosts + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/32767.0,-1.0));" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 2 && attrib->isSigned == 1 ) + { + // seen in Rabbids Land + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.xyzw = as_type(float4(int4(attrDecoder)));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2) + { + // seen in Giana Sisters: Twisted Dreams + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); + } + else if (attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + // seen in Nano Assault Neo + 
_readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2) + { + // seen in Giana Sisters: Twisted Dreams + _readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + src->addFmt("attrDecoder.xyzw = as_type(float4(attrDataSem{}.xyzw)/255.0);" _CRLF, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/127.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) + { + src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, 
attributeInputIndex); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned != 0) + { + // seen in Sonic Lost World + src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0 ) + { + // seen in One Piece + src->addFmt("attrDecoder.xyzw = as_type(float4(attrDataSem{}.xyzw));" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned == 0) + { + if( (attrib->offset&3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL ) + { + // AMD workaround + src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.zw)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.xy)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) + { + // seen in BotW + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.zw));" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.xy));" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned != 0) + { + if 
((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xy = attrDataSem{}.zw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) + { + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.zw,0,0);" _CRLF, attributeInputIndex); + } + else + { + src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); + } + } + else if( attrib->format == FMT_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + // seen in Pikmin 3 + src->addFmt("attrDecoder.x = as_type(float(attrDataSem{}.x)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.yzw = uint3(0);" _CRLF); + } + else if( attrib->format == FMT_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); + } + else 
+ { + cemuLog_log(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}\n", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + cemu_assert_debug(false); + } + } + else if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_U16 ) + { + if( attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/32767.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in BotW + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x))/65535.0);" _CRLF); + src->add("attrDecoder.y = as_type(float(int(attrDecoder.y))/65535.0);" _CRLF); + src->add("attrDecoder.z = as_type(float(int(attrDecoder.z))/65535.0);" _CRLF); + src->add("attrDecoder.w = as_type(float(int(attrDecoder.w))/65535.0);" _CRLF); + } + else 
if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 2 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x)));" _CRLF); + src->add("attrDecoder.y = as_type(float(int(attrDecoder.y)));" _CRLF); + src->add("attrDecoder.z = as_type(float(int(attrDecoder.z)));" _CRLF); + src->add("attrDecoder.w = as_type(float(int(attrDecoder.w)));" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 1 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, 
attributeInputIndex); + src->add("attrDecoder.xy = as_type(float2(float(attrDecoder.x), float(attrDecoder.y))/65535.0);" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 1 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 2 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(float2(float(attrDecoder.x), float(attrDecoder.y)));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 2 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.xy = 
as_type(float2(float(int(attrDecoder.x)), float(int(attrDecoder.y))));" _CRLF); /* float2, not uint2: the sign-extended values must stay floats before the bit cast, as in the unsigned FMT_16_16 nfa==2 branch */ + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if (attrib->format == FMT_16 && attrib->nfa == 1 && attrib->isSigned == 0) + { + _readBigEndianAttributeU16x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_16 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in CoD ghosts + _readBigEndianAttributeU16x1(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x))/65535.0);" _CRLF); + } + else + { + cemuLog_logDebug(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + } + } + else + { + cemu_assert_debug(false); + } +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp new file mode 100644 index 00000000..fade4775 --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -0,0 +1,426 @@ +#pragma once + +namespace LatteDecompiler +{ + static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext, LatteDecompilerOutputUniformOffsets& uniformOffsets) + { + LatteDecompilerShaderResourceMapping& resourceMapping = decompilerContext->output->resourceMappingVK; + + sint32 uniformCurrentOffset = 0; + auto shader = decompilerContext->shader; + auto shaderType = decompilerContext->shader->shaderType; + auto shaderSrc = decompilerContext->shaderSource; + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) + { + // uniform registers or buffers are accessed statically with predictable offsets + // this allows us to remap the used entries into a more compact array + if (shaderType == LatteConst::ShaderType::Vertex) + shaderSrc->addFmt("uniform ivec4 uf_remappedVS[{}];" _CRLF, 
(sint32)shader->list_remappedUniformEntries.size()); + else if (shaderType == LatteConst::ShaderType::Pixel) + shaderSrc->addFmt("uniform ivec4 uf_remappedPS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + else if (shaderType == LatteConst::ShaderType::Geometry) + shaderSrc->addFmt("uniform ivec4 uf_remappedGS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + else + debugBreakpoint(); + uniformOffsets.offset_remapped = uniformCurrentOffset; + uniformCurrentOffset += 16 * shader->list_remappedUniformEntries.size(); + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) + { + uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(decompilerContext->shaderBaseHash, 256); + // full or partial uniform register file has to be present + if (shaderType == LatteConst::ShaderType::Vertex) + shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterVS[{}];" _CRLF, cfileSize); + else if (shaderType == LatteConst::ShaderType::Pixel) + shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterPS[{}];" _CRLF, cfileSize); + else if (shaderType == LatteConst::ShaderType::Geometry) + shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterGS[{}];" _CRLF, cfileSize); + uniformOffsets.offset_uniformRegister = uniformCurrentOffset; + uniformOffsets.count_uniformRegister = cfileSize; + uniformCurrentOffset += 16 * cfileSize; + } + // special uniforms + bool hasAnyViewportScaleDisabled = + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && hasAnyViewportScaleDisabled) + { + // aka GX2 special state 0 + uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; + shaderSrc->add("uniform vec2 uf_windowSpaceToClipSpaceTransform;" 
_CRLF); + uniformOffsets.offset_windowSpaceToClipSpaceTransform = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + bool alphaTestEnable = decompilerContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE(); + if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel && alphaTestEnable) + { + uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; + shaderSrc->add("uniform float uf_alphaTestRef;" _CRLF); + uniformOffsets.offset_alphaTestRef = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + if (decompilerContext->analyzer.outputPointSize && decompilerContext->analyzer.writesPointSize == false) + { + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; + shaderSrc->add("uniform float uf_pointSize;" _CRLF); + uniformOffsets.offset_pointSize = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + } + // define uf_fragCoordScale which holds the xy scale for render target resolution vs effective resolution + if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; + shaderSrc->add("uniform vec2 uf_fragCoordScale;" _CRLF); + uniformOffsets.offset_fragCoordScale = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + // provide scale factor for every texture that is accessed via texel coordinates (texelFetch) + for (sint32 t = 0; t < LATTE_NUM_MAX_TEX_UNITS; t++) + { + if (decompilerContext->analyzer.texUnitUsesTexelCoordinates.test(t) == false) + continue; + uniformCurrentOffset = (uniformCurrentOffset + 7) & ~7; + shaderSrc->addFmt("uniform vec2 uf_tex{}Scale;" _CRLF, t); + uniformOffsets.offset_texScale[t] = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + // define uf_verticesPerInstance + uf_streamoutBufferBaseX + if 
(decompilerContext->analyzer.useSSBOForStreamout && + ((shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) || + (shader->shaderType == LatteConst::ShaderType::Geometry)) ) /* && binds tighter than ||, so the stage test is grouped to let the SSBO-streamout check gate both the vertex and the geometry case */ + { + shaderSrc->add("uniform int uf_verticesPerInstance;" _CRLF); + uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; + uniformCurrentOffset += 4; + for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (decompilerContext->output->streamoutBufferWriteMask[i]) + { + shaderSrc->addFmt("uniform int uf_streamoutBufferBase{};" _CRLF, i); + uniformOffsets.offset_streamoutBufferBase[i] = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + } + } + + uniformOffsets.offset_endOfBlock = uniformCurrentOffset; + } + + static void _emitUniformBuffers(LatteDecompilerShaderContext* decompilerContext) + { + auto shaderSrc = decompilerContext->shaderSource; + // uniform buffer definition + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) + continue; + + cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); + + shaderSrc->addFmt("UNIFORM_BUFFER_LAYOUT({}, {}, {}) ", (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i], (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); + + shaderSrc->addFmt("uniform ubuff{}" _CRLF, i); + shaderSrc->add("{" _CRLF); + shaderSrc->addFmt("float4 ubuff{}[{}];" _CRLF, i, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); + shaderSrc->add("};" _CRLF 
_CRLF); + shaderSrc->add(_CRLF); + } + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) + { + // already generated in _emitUniformVariables + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) + { + // already generated in _emitUniformVariables + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_NONE) + { + // no uniforms used + } + else + { + cemu_assert_debug(false); + } + } + + /* Appends a "TYPE tex{i} [[texture(b)]], sampler samplr{i} [[sampler(b)]], " parameter pair to shaderSource for every texture unit i set in output->textureUnitMask. TYPE is picked from shader->textureUnitDim[i] (integer formats handle only 1D/2D/2D_MSAA) and b is resourceMappingGL.textureUnitToBindingPoint[i]; unhandled dimensions hit cemu_assert_unimplemented(). NOTE(review): the type keyword is emitted without a template argument and DIM_CUBEMAP maps to texturecube_array - confirm both against the MSL spec. */ static void _emitTextureDefinitions(LatteDecompilerShaderContext* shaderContext) + { + auto src = shaderContext->shaderSource; + // texture sampler definition + for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) + { + if (!shaderContext->output->textureUnitMask[i]) + continue; + + if (shaderContext->shader->textureIsIntegerFormat[i]) + { + // integer samplers + if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("texture1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("texture2d"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("texture2d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("texture1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) + src->add("texture2d_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) + src->add("texturecube_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) + src->add("texture3d"); + else + { + cemu_assert_unimplemented(); + } + + src->addFmt(" tex{} [[texture({})]], ", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); + src->addFmt("sampler samplr{}
[[sampler({})]], ", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); + } + } + + /* For vertex shaders only: appends one "ATTR_LAYOUT(set, location) in uvec4 attrDataSem{i};" line for every attribute index i set in analyzer.inputAttributSemanticMask, after debug-asserting that the GL and VK attribute mappings for i are non-negative and equal. Other shader types emit nothing. NOTE(review): the emitted text is GLSL-style syntax - presumably a placeholder in this MSL emitter; confirm. */ static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext) + { + auto shaderSrc = decompilerContext->shaderSource; + if (decompilerContext->shader->shaderType == LatteConst::ShaderType::Vertex) + { + // attribute inputs + for (uint32 i = 0; i < LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS; i++) + { + if (decompilerContext->analyzer.inputAttributSemanticMask[i]) + { + cemu_assert_debug(decompilerContext->output->resourceMappingGL.attributeMapping[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingVK.attributeMapping[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingGL.attributeMapping[i] == decompilerContext->output->resourceMappingVK.attributeMapping[i]); + + shaderSrc->addFmt("ATTR_LAYOUT({}, {}) in uvec4 attrDataSem{};" _CRLF, (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i], i); + } + } + } + } + + /* Appends a "layout(location = N) [flat ][noperspective ]out vec4 passParameterSem{id};" line for each of the 32 possible vertex output parameters set in shader->outputParameterMask. N is the index of the PS input table entry whose semanticId matches the parameter's semantic id; parameters whose semantic id exceeds LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX or that have no matching PS input are skipped. */ static void _emitVSExports(LatteDecompilerShaderContext* shaderContext) + { + auto* src = shaderContext->shaderSource; + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + auto parameterMask = shaderContext->shader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask&(1 << i)) == 0) + continue; + uint32 vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, i); + if (vsSemanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + // get import based on semanticId + sint32 psInputIndex = -1; + for (sint32 f = 0; f < psInputTable->count; f++) + { + if (psInputTable->import[f].semanticId == vsSemanticId) + { + psInputIndex = f; + break; + } + } + if (psInputIndex == -1) + continue; // no ps input + + src->addFmt("layout(location = {}) ", psInputIndex); + if (psInputTable->import[psInputIndex].isFlat) + src->add("flat "); + if
(psInputTable->import[psInputIndex].isNoPerspective) + src->add("noperspective "); + src->add("out"); + src->addFmt(" vec4 passParameterSem{};" _CRLF, psInputTable->import[psInputIndex].semanticId); + } + } + + /* Appends a "layout(location = i) [flat ][noperspective ]in vec4 passParameterSem{id};" line for each PS input table entry i whose semanticId does not exceed LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX; the location is the table index and the variable suffix is the entry's semanticId. */ static void _emitPSImports(LatteDecompilerShaderContext* shaderContext) + { + auto* src = shaderContext->shaderSource; + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + for (sint32 i = 0; i < psInputTable->count; i++) + { + if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + src->addFmt("layout(location = {}) ", i); + if (psInputTable->import[i].isFlat) + src->add("flat "); + if (psInputTable->import[i].isNoPerspective) + src->add("noperspective "); + src->add("in"); + src->addFmt(" vec4 passParameterSem{};" _CRLF, psInputTable->import[i].semanticId); + } + } + + /* Appends the remaining interface declarations to shaderSource, in this order: the gl_PerVertex block (vertex shader without geometry shader, or geometry shader); the inter-stage varyings (_emitVSExports / _emitPSImports when options->usesGeometryShader is false, otherwise the v2g ring block and the passG2PParameter variables); the pixel color outputs selected by shader->pixelColorOutputMask; and the streamout declarations when analyzer.hasStreamoutEnable is set. NOTE(review): all emitted text is GLSL-style syntax - presumably a placeholder in this MSL emitter; confirm. */ static void _emitMisc(LatteDecompilerShaderContext* decompilerContext) + { + auto src = decompilerContext->shaderSource; + // per-vertex output (VS or GS) + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) + { + src->add("out gl_PerVertex" _CRLF); + src->add("{" _CRLF); + src->add(" vec4 gl_Position;" _CRLF); + if (decompilerContext->analyzer.outputPointSize) + src->add(" float gl_PointSize;" _CRLF); + src->add("};" _CRLF); + } + // varyings (variables passed from vertex to pixel shader, only if geometry stage is disabled) + if (decompilerContext->options->usesGeometryShader == false) + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + _emitVSExports(decompilerContext); + } + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + { + _emitPSImports(decompilerContext); + } + } + else + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + // parameters
shared between vertex shader and geometry shader + src->add("V2G_LAYOUT "); + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + src->add("out Vertex" _CRLF); + else if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + src->add("in Vertex" _CRLF); + src->add("{" _CRLF); + uint32 ringParameterCountVS2GS = 0; + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCount; + } + else + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCountFromPrevStage; + } + for (uint32 f = 0; f < ringParameterCountVS2GS; f++) + src->addFmt(" ivec4 passV2GParameter{};" _CRLF, f); + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + src->add("}v2g;" _CRLF); + else if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + src->add("}v2g[];" _CRLF); + } + if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + // parameters shared between geometry and pixel shader + uint32 ringItemSize = decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF; + if ((ringItemSize & 0xF) != 0) + debugBreakpoint(); + /* NOTE(review): this repeats the ringItemSize alignment check directly above using the same register and masks */ if (((decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) & 0xF) != 0) + debugBreakpoint(); + + for (sint32 p = 0; p < decompilerContext->parsedGSCopyShader->numParam; p++) + { + if (decompilerContext->parsedGSCopyShader->paramMapping[p].exportType != 2) + continue; + src->addFmt("layout(location = {}) out vec4 passG2PParameter{};" _CRLF, decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam & 0x7F, (sint32)decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam); + } + } + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + { + // pixel shader with geometry shader + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + for (sint32 i = 0; i < psInputTable->count; i++) + { + if
(psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + uint32 location = psInputTable->import[i].semanticId & 0x7F; // todo - the range above 128 has special meaning? + + src->addFmt("layout(location = {}) ", location); + if (psInputTable->import[i].isFlat) + src->add("flat "); + if (psInputTable->import[i].isNoPerspective) + src->add("noperspective "); + /* NOTE(review): the enclosing branch is only entered when shaderType is Pixel, so the Vertex arm and the final else arm below look unreachable */ if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + src->add("out"); + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + src->add("in"); + else + debugBreakpoint(); + + src->addFmt(" vec4 passG2PParameter{};" _CRLF, (sint32)location); + } + } + } + // output defines + if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + { + // generate pixel outputs for pixel shader + for (uint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + if ((decompilerContext->shader->pixelColorOutputMask&(1 << i)) != 0) + { + src->addFmt("layout(location = {}) out vec4 passPixelColor{};" _CRLF, i, i); + } + } + } + // streamout buffer (transform feedback) + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) && decompilerContext->analyzer.hasStreamoutEnable) + { + if (decompilerContext->options->useTFViaSSBO) + { + if (decompilerContext->analyzer.useSSBOForStreamout && decompilerContext->analyzer.hasStreamoutWrite) + { + src->addFmt("layout(set = {}, binding = {}) buffer StreamoutBuffer" _CRLF, decompilerContext->output->resourceMappingVK.setIndex, decompilerContext->output->resourceMappingVK.getTFStorageBufferBindingPoint()); + src->add("{" _CRLF); + src->add("int sb_buffer[];" _CRLF); + src->add("};" _CRLF); + } + } + else + { + sint32 locationOffset = 0; // glslang wants a location for xfb outputs + for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (!decompilerContext->output->streamoutBufferWriteMask[i]) + continue; + uint32 bufferStride =
decompilerContext->output->streamoutBufferStride[i]; + src->addFmt("XFB_BLOCK_LAYOUT({}, {}, {}) out XfbBlock{} " _CRLF, i, bufferStride, locationOffset, i); + src->add("{" _CRLF); + src->addFmt("layout(xfb_buffer = {}, xfb_offset = 0) int sb{}[{}];" _CRLF, i, i, decompilerContext->output->streamoutBufferStride[i] / 4); + src->add("};" _CRLF); + locationOffset += (decompilerContext->output->streamoutBufferStride[i] / 4); + } + } + } + } + + /* Entry point of this header emitter: appends, in order, the uniform variables (using output->uniformOffsetsVK), uniform buffers, texture definitions, attributes and misc declarations to decompilerContext->shaderSource. When ActiveSettings::DumpShadersEnabled() is true the section is bracketed by "start of" / "end of shader inputs/outputs" marker comments. */ static void emitHeader(LatteDecompilerShaderContext* decompilerContext) + { + const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); + if(dump_shaders_enabled) + decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. Do not touch" _CRLF); + // uniform variables + _emitUniformVariables(decompilerContext, decompilerContext->output->uniformOffsetsVK); + // uniform buffers + _emitUniformBuffers(decompilerContext); + // textures + _emitTextureDefinitions(decompilerContext); + // attributes + _emitAttributes(decompilerContext); + // misc stuff + _emitMisc(decompilerContext); + + if (dump_shaders_enabled) + decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); + } +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h index ed1858ba..4b85d458 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h @@ -47,7 +47,7 @@ struct LatteDecompilerTEXInstruction sint32 dstGpr; sint8 dstSel[4]; // texture fetch - struct + struct { sint32 textureIndex{}; sint32 samplerIndex{}; @@ -216,7 +216,7 @@ struct LatteDecompilerShaderContext bool genIntReg; // if set, generate R*i register variables bool useArrayGPRs; // if set, an array is used to represent GPRs instead of individual variables }typeTracker; - // analyzer + // analyzer struct { // general @@ -268,9 +268,10 @@ struct
LatteDecompilerShaderContext void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); void LatteDecompiler_analyzeDataTypes(LatteDecompilerShaderContext* shaderContext); void LatteDecompiler_emitGLSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); +void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); void LatteDecompiler_cleanup(LatteDecompilerShaderContext* shaderContext); // helper functions -sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex); \ No newline at end of file +sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index c49035dd..b1710e8a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -3,6 +3,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "HW/Latte/Core/LatteShader.h" #include "gui/guiWrapper.h" MetalRenderer::MetalRenderer() @@ -259,6 +260,8 @@ void MetalRenderer::streamout_rendererFinishDrawcall() void MetalRenderer::draw_beginSequence() { cemuLog_logDebug(LogType::Force, "not implemented"); + + LatteSHRC_UpdateActiveShaders(); } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst)