diff options
Diffstat (limited to 'src')
23 files changed, 530 insertions, 285 deletions
diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h index 44d8ae11f..4bbcc3571 100644 --- a/src/common/common_funcs.h +++ b/src/common/common_funcs.h @@ -35,6 +35,13 @@ template<> struct CompileTimeAssert<true> {}; #define CONCAT2(x, y) DO_CONCAT2(x, y) #define DO_CONCAT2(x, y) x ## y +// helper macro to properly align structure members. +// Calling INSERT_PADDING_BYTES will add a new member variable with a name like "pad121", +// depending on the current source line to make sure variable names are unique. +#define INSERT_PADDING_BYTES_HELPER1(x, y) x ## y +#define INSERT_PADDING_BYTES_HELPER2(x, y) INSERT_PADDING_BYTES_HELPER1(x, y) +#define INSERT_PADDING_BYTES(num_words) u8 INSERT_PADDING_BYTES_HELPER2(pad, __LINE__)[(num_words)] + #ifndef _MSC_VER #include <errno.h> diff --git a/src/core/arm/dyncom/arm_dyncom_dec.cpp b/src/core/arm/dyncom/arm_dyncom_dec.cpp index ffa627352..9f3b90fd0 100644 --- a/src/core/arm/dyncom/arm_dyncom_dec.cpp +++ b/src/core/arm/dyncom/arm_dyncom_dec.cpp @@ -42,7 +42,7 @@ const ISEITEM arm_instruction[] = { { "srs", 4, 6, 25, 31, 0x0000007c, 22, 22, 0x00000001, 16, 20, 0x0000000d, 8, 11, 0x00000005 }, { "rfe", 4, 6, 25, 31, 0x0000007c, 22, 22, 0x00000000, 20, 20, 0x00000001, 8, 11, 0x0000000a }, - { "bkpt", 2, 3, 20, 31, 0x00000e12, 4, 7, 0x00000007 }, + { "bkpt", 2, 3, 20, 27, 0x00000012, 4, 7, 0x00000007 }, { "blx", 1, 3, 25, 31, 0x0000007d }, { "cps", 3, 6, 20, 31, 0x00000f10, 16, 16, 0x00000000, 5, 5, 0x00000000 }, { "pld", 4, 4, 26, 31, 0x0000003d, 24, 24, 0x00000001, 20, 22, 0x00000005, 12, 15, 0x0000000f }, diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp index f4b3c4734..b691ffbc3 100644 --- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp +++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp @@ -792,6 +792,7 @@ typedef struct _stm_inst { } stm_inst; struct bkpt_inst { + u32 imm; }; struct blx1_inst { @@ -1371,7 +1372,22 @@ static ARM_INST_PTR 
INTERPRETER_TRANSLATE(bic)(unsigned int inst, int index) inst_base->br = INDIRECT_BRANCH; return inst_base; } -static ARM_INST_PTR INTERPRETER_TRANSLATE(bkpt)(unsigned int inst, int index) { UNIMPLEMENTED_INSTRUCTION("BKPT"); } + +static ARM_INST_PTR INTERPRETER_TRANSLATE(bkpt)(unsigned int inst, int index) +{ + arm_inst* const inst_base = (arm_inst*)AllocBuffer(sizeof(arm_inst) + sizeof(bkpt_inst)); + bkpt_inst* const inst_cream = (bkpt_inst*)inst_base->component; + + inst_base->cond = BITS(inst, 28, 31); + inst_base->idx = index; + inst_base->br = NON_BRANCH; + inst_base->load_r15 = 0; + + inst_cream->imm = BITS(inst, 8, 19) | BITS(inst, 0, 3); + + return inst_base; +} + static ARM_INST_PTR INTERPRETER_TRANSLATE(blx)(unsigned int inst, int index) { arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(blx_inst)); @@ -3211,6 +3227,7 @@ static ARM_INST_PTR INTERPRETER_TRANSLATE(usada8)(unsigned int inst, int index) inst_cream->op1 = BITS(inst, 20, 24); inst_cream->op2 = BITS(inst, 5, 7); + inst_cream->Rd = BITS(inst, 16, 19); inst_cream->Rm = BITS(inst, 8, 11); inst_cream->Rn = BITS(inst, 0, 3); inst_cream->Ra = BITS(inst, 12, 15); @@ -4080,6 +4097,16 @@ unsigned InterpreterMainLoop(ARMul_State* state) { GOTO_NEXT_INST; } BKPT_INST: + { + if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) { + bkpt_inst* const inst_cream = (bkpt_inst*)inst_base->component; + LOG_DEBUG(Core_ARM11, "Breakpoint instruction hit. Immediate: 0x%08X", inst_cream->imm); + } + cpu->Reg[15] += GET_INST_SIZE(cpu); + INC_PC(sizeof(bkpt_inst)); + FETCH_INST; + GOTO_NEXT_INST; + } BLX_INST: { blx_inst *inst_cream = (blx_inst *)inst_base->component; diff --git a/src/core/arm/skyeye_common/armemu.h b/src/core/arm/skyeye_common/armemu.h index 8bfd4e0f0..2a1c50779 100644 --- a/src/core/arm/skyeye_common/armemu.h +++ b/src/core/arm/skyeye_common/armemu.h @@ -35,7 +35,7 @@ enum : u32 { // Masks for groups of bits in the APSR. 
MODEBITS = 0x1F, - INTBITS = 0xC0, + INTBITS = 0x1C0, }; // Different ways to start the next instruction. diff --git a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp index 17726b8bb..1a05ef8c1 100644 --- a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp +++ b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp @@ -381,7 +381,7 @@ static u32 vfp_compare(ARMul_State* state, int dd, int signal_on_qnan, int dm, u s64 d, m; u32 ret = 0; - LOG_TRACE(Core_ARM11, "In %s, state=0x%x, fpscr=0x%x\n", __FUNCTION__, state, fpscr); + LOG_TRACE(Core_ARM11, "In %s, state=0x%p, fpscr=0x%x\n", __FUNCTION__, state, fpscr); m = vfp_get_double(state, dm); if (vfp_double_packed_exponent(m) == 2047 && vfp_double_packed_mantissa(m)) { ret |= FPSCR_CFLAG | FPSCR_VFLAG; @@ -436,7 +436,7 @@ static u32 vfp_compare(ARMul_State* state, int dd, int signal_on_qnan, int dm, u ret |= FPSCR_CFLAG; } } - LOG_TRACE(Core_ARM11, "In %s, state=0x%x, ret=0x%x\n", __FUNCTION__, state, ret); + LOG_TRACE(Core_ARM11, "In %s, state=0x%p, ret=0x%x\n", __FUNCTION__, state, ret); return ret; } diff --git a/src/core/arm/skyeye_common/vfp/vfpinstr.cpp b/src/core/arm/skyeye_common/vfp/vfpinstr.cpp index 1f1b5b1c3..b9b96c388 100644 --- a/src/core/arm/skyeye_common/vfp/vfpinstr.cpp +++ b/src/core/arm/skyeye_common/vfp/vfpinstr.cpp @@ -1443,7 +1443,7 @@ VPUSH_INST: addr = cpu->Reg[R13] - inst_cream->imm32; - for (int i = 0; i < inst_cream->regs; i++) + for (unsigned int i = 0; i < inst_cream->regs; i++) { if (inst_cream->single) { @@ -1512,7 +1512,7 @@ VSTM_INST: /* encoding 1 */ addr = (inst_cream->add ? 
cpu->Reg[inst_cream->n] : cpu->Reg[inst_cream->n] - inst_cream->imm32); - for (int i = 0; i < inst_cream->regs; i++) + for (unsigned int i = 0; i < inst_cream->regs; i++) { if (inst_cream->single) { @@ -1581,7 +1581,7 @@ VPOP_INST: addr = cpu->Reg[R13]; - for (int i = 0; i < inst_cream->regs; i++) + for (unsigned int i = 0; i < inst_cream->regs; i++) { if (inst_cream->single) { @@ -1718,7 +1718,7 @@ VLDM_INST: addr = (inst_cream->add ? cpu->Reg[inst_cream->n] : cpu->Reg[inst_cream->n] - inst_cream->imm32); - for (int i = 0; i < inst_cream->regs; i++) + for (unsigned int i = 0; i < inst_cream->regs; i++) { if (inst_cream->single) { diff --git a/src/core/hle/config_mem.cpp b/src/core/hle/config_mem.cpp index 68d3071f5..b10c19d1d 100644 --- a/src/core/hle/config_mem.cpp +++ b/src/core/hle/config_mem.cpp @@ -3,60 +3,54 @@ // Refer to the license.txt file included. #include "common/common_types.h" -#include "common/logging/log.h" +#include "common/common_funcs.h" +#include "core/core.h" +#include "core/mem_map.h" #include "core/hle/config_mem.h" //////////////////////////////////////////////////////////////////////////////////////////////////// namespace ConfigMem { -enum { - KERNEL_VERSIONREVISION = 0x1FF80001, - KERNEL_VERSIONMINOR = 0x1FF80002, - KERNEL_VERSIONMAJOR = 0x1FF80003, - UPDATEFLAG = 0x1FF80004, - NSTID = 0x1FF80008, - SYSCOREVER = 0x1FF80010, - UNITINFO = 0x1FF80014, - KERNEL_CTRSDKVERSION = 0x1FF80018, - APPMEMTYPE = 0x1FF80030, - APPMEMALLOC = 0x1FF80040, - FIRM_VERSIONREVISION = 0x1FF80061, - FIRM_VERSIONMINOR = 0x1FF80062, - FIRM_VERSIONMAJOR = 0x1FF80063, - FIRM_SYSCOREVER = 0x1FF80064, - FIRM_CTRSDKVERSION = 0x1FF80068, +struct ConfigMemDef { + u8 kernel_unk; // 0 + u8 kernel_version_rev; // 1 + u8 kernel_version_min; // 2 + u8 kernel_version_maj; // 3 + u32 update_flag; // 4 + u64 ns_tid; // 8 + u32 sys_core_ver; // 10 + u8 unit_info; // 14 + u8 boot_firm; // 15 + u8 prev_firm; // 16 + INSERT_PADDING_BYTES(0x1); // 17 + u32 ctr_sdk_ver; // 18 + 
INSERT_PADDING_BYTES(0x30 - 0x1C); // 1C + u32 app_mem_type; // 30 + INSERT_PADDING_BYTES(0x40 - 0x34); // 34 + u32 app_mem_alloc; // 40 + u32 sys_mem_alloc; // 44 + u32 base_mem_alloc; // 48 + INSERT_PADDING_BYTES(0x60 - 0x4C); // 4C + u8 firm_unk; // 60 + u8 firm_version_rev; // 61 + u8 firm_version_min; // 62 + u8 firm_version_maj; // 63 + u32 firm_sys_core_ver; // 64 + u32 firm_ctr_sdk_ver; // 68 + INSERT_PADDING_BYTES(0x1000 - 0x6C); // 6C }; -template <typename T> -inline void Read(T &var, const u32 addr) { - switch (addr) { - - // Bit 0 set for Retail - case UNITINFO: - var = 0x00000001; - break; - - // Set app memory size to 64MB? - case APPMEMALLOC: - var = 0x04000000; - break; +static_assert(sizeof(ConfigMemDef) == Memory::CONFIG_MEMORY_SIZE, "Config Memory structure size is wrong"); - // Unknown - normally set to: 0x08000000 - (APPMEMALLOC + *0x1FF80048) - // (Total FCRAM size - APPMEMALLOC - *0x1FF80048) - case 0x1FF80044: - var = 0x08000000 - (0x04000000 + 0x1400000); - break; +static ConfigMemDef config_mem; - // Unknown - normally set to: 0x1400000 (20MB) - case 0x1FF80048: - var = 0x1400000; - break; - - default: - LOG_ERROR(Kernel, "unknown addr=0x%08X", addr); - } +template <typename T> +inline void Read(T &var, const u32 addr) { + u32 offset = addr - Memory::CONFIG_MEMORY_VADDR; + ASSERT(offset < Memory::CONFIG_MEMORY_SIZE); + var = *(reinterpret_cast<T*>(((uintptr_t)&config_mem) + offset)); } // Explicitly instantiate template functions because we aren't defining this in the header: @@ -66,5 +60,21 @@ template void Read<u32>(u32 &var, const u32 addr); template void Read<u16>(u16 &var, const u32 addr); template void Read<u8>(u8 &var, const u32 addr); +void Init() { + config_mem.update_flag = 0; // No update + config_mem.sys_core_ver = 0x2; + config_mem.unit_info = 0x1; // Bit 0 set for Retail + config_mem.prev_firm = 0; + config_mem.app_mem_type = 0; // Defualt app mem type + config_mem.unit_info = 0x1; // Bit 0 set for Retail + 
config_mem.app_mem_alloc = 0x04000000; // Default app memory size is 64MB + config_mem.base_mem_alloc = 0x01400000; // Default base memory is 20MB + config_mem.sys_mem_alloc = Memory::FCRAM_SIZE - (config_mem.app_mem_alloc + config_mem.base_mem_alloc); + config_mem.firm_unk = 0; + config_mem.firm_version_rev = 0; + config_mem.firm_version_min = 0x40; + config_mem.firm_version_maj = 0x2; + config_mem.firm_sys_core_ver = 0x2; +} } // namespace diff --git a/src/core/hle/config_mem.h b/src/core/hle/config_mem.h index 3975af18f..94853901a 100644 --- a/src/core/hle/config_mem.h +++ b/src/core/hle/config_mem.h @@ -18,4 +18,6 @@ namespace ConfigMem { template <typename T> void Read(T &var, const u32 addr); +void Init(); + } // namespace diff --git a/src/core/hle/hle.cpp b/src/core/hle/hle.cpp index 529133ca7..b0066e15e 100644 --- a/src/core/hle/hle.cpp +++ b/src/core/hle/hle.cpp @@ -7,6 +7,7 @@ #include "core/arm/arm_interface.h" #include "core/mem_map.h" #include "core/hle/hle.h" +#include "core/hle/config_mem.h" #include "core/hle/shared_page.h" #include "core/hle/kernel/thread.h" #include "core/hle/service/service.h" @@ -75,6 +76,7 @@ void Init() { RegisterAllModules(); + ConfigMem::Init(); SharedPage::Init(); LOG_DEBUG(Kernel, "initialized OK"); diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp index 31e61391f..c23cfa3c8 100644 --- a/src/core/hle/service/gsp_gpu.cpp +++ b/src/core/hle/service/gsp_gpu.cpp @@ -368,28 +368,28 @@ static void ExecuteCommand(const Command& command, u32 thread_id) { case CommandId::SET_MEMORY_FILL: { auto& params = command.memory_fill; - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_start)), - Memory::VirtualToPhysicalAddress(params.start1) >> 3); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)), - Memory::VirtualToPhysicalAddress(params.end1) >> 3); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].size)), params.end1 - 
params.start1); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value)), params.value1); - - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)), - Memory::VirtualToPhysicalAddress(params.start2) >> 3); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)), - Memory::VirtualToPhysicalAddress(params.end2) >> 3); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].size)), params.end2 - params.start2); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value)), params.value2); + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_start)), + Memory::VirtualToPhysicalAddress(params.start1) >> 3); + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)), + Memory::VirtualToPhysicalAddress(params.end1) >> 3); + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value_32bit)), params.value1); + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].control)), params.control1); + + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)), + Memory::VirtualToPhysicalAddress(params.start2) >> 3); + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)), + Memory::VirtualToPhysicalAddress(params.end2) >> 3); + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value_32bit)), params.value2); + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].control)), params.control2); break; } case CommandId::SET_DISPLAY_TRANSFER: { auto& params = command.image_copy; - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); - 
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3); WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size); WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size); @@ -402,9 +402,9 @@ static void ExecuteCommand(const Command& command, u32 thread_id) { case CommandId::SET_TEXTURE_COPY: { auto& params = command.image_copy; - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); - WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), + WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3); WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size); WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size); diff --git a/src/core/hle/service/gsp_gpu.h b/src/core/hle/service/gsp_gpu.h index 65abb194a..a435d418a 100644 --- a/src/core/hle/service/gsp_gpu.h +++ b/src/core/hle/service/gsp_gpu.h @@ -109,9 +109,13 @@ struct Command { u32 start1; u32 value1; u32 end1; + u32 start2; u32 value2; u32 end2; + + u16 control1; + u16 control2; } memory_fill; struct { diff --git a/src/core/hle/service/service.cpp b/src/core/hle/service/service.cpp index e0979ea5d..5dce8068e 100644 --- a/src/core/hle/service/service.cpp +++ b/src/core/hle/service/service.cpp @@ -71,6 +71,7 @@ static void AddService(Interface* interface) { /// Initialize ServiceManager void 
Init() { AddNamedPort(new SRV::Interface); + AddNamedPort(new ERR_F::Interface); AddService(new AC_U::Interface); AddService(new ACT_U::Interface); @@ -90,7 +91,6 @@ void Init() { AddService(new CFG_U::Interface); AddService(new CSND_SND::Interface); AddService(new DSP_DSP::Interface); - AddService(new ERR_F::Interface); AddService(new FRD_A::Interface); AddService(new FRD_U::Interface); AddService(new FS::FSUserInterface); diff --git a/src/core/hle/shared_page.cpp b/src/core/hle/shared_page.cpp index f0726ef09..568dad684 100644 --- a/src/core/hle/shared_page.cpp +++ b/src/core/hle/shared_page.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include "common/common_types.h" +#include "common/common_funcs.h" #include "core/core.h" #include "core/mem_map.h" @@ -13,13 +14,6 @@ namespace SharedPage { -// helper macro to properly align structure members. -// Calling INSERT_PADDING_BYTES will add a new member variable with a name like "pad121", -// depending on the current source line to make sure variable names are unique. -#define INSERT_PADDING_BYTES_HELPER1(x, y) x ## y -#define INSERT_PADDING_BYTES_HELPER2(x, y) INSERT_PADDING_BYTES_HELPER1(x, y) -#define INSERT_PADDING_BYTES(num_words) u8 INSERT_PADDING_BYTES_HELPER2(pad, __LINE__)[(num_words)] - // see http://3dbrew.org/wiki/Configuration_Memory#Shared_Memory_Page_For_ARM11_Processes #pragma pack(1) diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp index aad0e5d0d..bd7d92cd1 100644 --- a/src/core/hw/gpu.cpp +++ b/src/core/hw/gpu.cpp @@ -67,23 +67,38 @@ inline void Write(u32 addr, const T data) { switch (index) { // Memory fills are triggered once the fill value is written. - // NOTE: This is not verified. 
- case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].value, 0x00004 + 0x3): - case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].value, 0x00008 + 0x3): + case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].trigger, 0x00004 + 0x3): + case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].trigger, 0x00008 + 0x3): { - const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].value)); - const auto& config = g_regs.memory_fill_config[is_second_filler]; - - // TODO: Not sure if this check should be done at GSP level instead - if (config.address_start) { - // TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all - u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress())); - u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress())); - for (u32* ptr = start; ptr < end; ++ptr) - *ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation + const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger)); + auto& config = g_regs.memory_fill_config[is_second_filler]; + + if (config.address_start && config.trigger) { + u8* start = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress())); + u8* end = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress())); + + if (config.fill_24bit) { + // fill with 24-bit values + for (u8* ptr = start; ptr < end; ptr += 3) { + ptr[0] = config.value_24bit_b; + ptr[1] = config.value_24bit_g; + ptr[2] = config.value_24bit_r; + } + } else if (config.fill_32bit) { + // fill with 32-bit values + for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr) + *ptr = config.value_32bit; + } else { + // fill with 16-bit values + for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr) + *ptr = config.value_16bit; + } LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), 
config.GetEndAddress()); + config.trigger = 0; + config.finished = 1; + if (!is_second_filler) { GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0); } else { diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h index 7c3a17ee5..df9aa0d71 100644 --- a/src/core/hw/gpu.h +++ b/src/core/hw/gpu.h @@ -53,6 +53,7 @@ struct Regs { "Structure size and register block length don't match") #endif + // All of those formats are described in reverse byte order, since the 3DS is little-endian. enum class PixelFormat : u32 { RGBA8 = 0, RGB8 = 1, @@ -61,13 +62,57 @@ struct Regs { RGBA4 = 4, }; + /** + * Returns the number of bytes per pixel. + */ + static int BytesPerPixel(PixelFormat format) { + switch (format) { + case PixelFormat::RGBA8: + return 4; + case PixelFormat::RGB8: + return 3; + case PixelFormat::RGB565: + case PixelFormat::RGB5A1: + case PixelFormat::RGBA4: + return 2; + default: + UNIMPLEMENTED(); + } + } + INSERT_PADDING_WORDS(0x4); struct { u32 address_start; - u32 address_end; // ? - u32 size; - u32 value; // ? + u32 address_end; + + union { + u32 value_32bit; + + BitField<0, 16, u32> value_16bit; + + // TODO: Verify component order + BitField< 0, 8, u32> value_24bit_r; + BitField< 8, 8, u32> value_24bit_g; + BitField<16, 8, u32> value_24bit_b; + }; + + union { + u32 control; + + // Setting this field to 1 triggers the memory fill. + // This field also acts as a status flag, and gets reset to 0 upon completion. + BitField<0, 1, u32> trigger; + + // Set to 1 upon completion. 
+ BitField<0, 1, u32> finished; + + // 0: fill with 16- or 32-bit wide values; 1: fill with 24-bit wide values + BitField<8, 1, u32> fill_24bit; + + // 0: fill with 16-bit wide values; 1: fill with 32-bit wide values + BitField<9, 1, u32> fill_32bit; + }; inline u32 GetStartAddress() const { return DecodeAddressRegister(address_start); diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 1744066ba..ba3876a76 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -15,30 +15,18 @@ namespace Clipper { struct ClippingEdge { public: - enum Type { - POS_X = 0, - NEG_X = 1, - POS_Y = 2, - NEG_Y = 3, - POS_Z = 4, - NEG_Z = 5, - }; - - ClippingEdge(Type type, float24 position) : type(type), pos(position) {} + ClippingEdge(Math::Vec4<float24> coeffs, + Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0), + float24::FromFloat32(0), + float24::FromFloat32(0), + float24::FromFloat32(0))) + : coeffs(coeffs), + bias(bias) + { + } bool IsInside(const OutputVertex& vertex) const { - switch (type) { - case POS_X: return vertex.pos.x <= pos * vertex.pos.w; - case NEG_X: return vertex.pos.x >= pos * vertex.pos.w; - case POS_Y: return vertex.pos.y <= pos * vertex.pos.w; - case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w; - - // TODO: Check z compares ... should be 0..1 instead? 
- case POS_Z: return vertex.pos.z <= pos * vertex.pos.w; - - default: - case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w; - } + return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0); } bool IsOutSide(const OutputVertex& vertex) const { @@ -46,31 +34,17 @@ public: } OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const { - auto dotpr = [this](const OutputVertex& vtx) { - switch (type) { - case POS_X: return vtx.pos.x - vtx.pos.w; - case NEG_X: return -vtx.pos.x - vtx.pos.w; - case POS_Y: return vtx.pos.y - vtx.pos.w; - case NEG_Y: return -vtx.pos.y - vtx.pos.w; - - // TODO: Verify z clipping - case POS_Z: return vtx.pos.z - vtx.pos.w; - - default: - case NEG_Z: return -vtx.pos.w; - } - }; - - float24 dp = dotpr(v0); - float24 dp_prev = dotpr(v1); + float24 dp = Math::Dot(v0.pos + bias, coeffs); + float24 dp_prev = Math::Dot(v1.pos + bias, coeffs); float24 factor = dp_prev / (dp_prev - dp); return OutputVertex::Lerp(factor, v0, v1); } private: - Type type; float24 pos; + Math::Vec4<float24> coeffs; + Math::Vec4<float24> bias; }; static void InitScreenCoordinates(OutputVertex& vtx) @@ -98,10 +72,9 @@ static void InitScreenCoordinates(OutputVertex& vtx) vtx.tc2 *= inv_w; vtx.pos.w = inv_w; - // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; - vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale; + vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale; } void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { @@ -117,14 +90,29 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { auto* output_list = &buffer_a; auto* input_list = &buffer_b; + // NOTE: We clip against a 
w=epsilon plane to guarantee that the output has a positive w value. + // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest + // epsilon possible within float24 accuracy. + static const float24 EPSILON = float24::FromFloat32(0.00001); + static const float24 f0 = float24::FromFloat32(0.0); + static const float24 f1 = float24::FromFloat32(1.0); + static const std::array<ClippingEdge, 7> clipping_edges = {{ + { Math::MakeVec( f1, f0, f0, -f1) }, // x = +w + { Math::MakeVec(-f1, f0, f0, -f1) }, // x = -w + { Math::MakeVec( f0, f1, f0, -f1) }, // y = +w + { Math::MakeVec( f0, -f1, f0, -f1) }, // y = -w + { Math::MakeVec( f0, f0, f1, f0) }, // z = 0 + { Math::MakeVec( f0, f0, -f1, -f1) }, // z = -w + { Math::MakeVec( f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON) }, // w = EPSILON + }}; + + // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii) + // drop the whole primitive instead of clipping the primitive properly. We should test if + // this happens on the 3DS, too. + // Simple implementation of the Sutherland-Hodgman clipping algorithm. 
// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) - for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)), - ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)), - ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)), - ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)), - ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)), - ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) { + for (auto edge : clipping_edges) { std::swap(input_list, output_list); output_list->clear(); diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 0d9f4ba66..586ad62b6 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <boost/range/algorithm/fill.hpp> + #include "clipper.h" #include "command_processor.h" #include "math.h" @@ -23,10 +25,6 @@ static int float_regs_counter = 0; static u32 uniform_write_buffer[4]; -// Used for VSLoadProgramData and VSLoadSwizzleData -static u32 vs_binary_write_offset = 0; -static u32 vs_swizzle_write_offset = 0; - static inline void WritePicaReg(u32 id, u32 value, u32 mask) { if (id >= registers.NumIds()) @@ -65,10 +63,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { // Information about internal vertex attributes u32 vertex_attribute_sources[16]; - std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef); + boost::fill(vertex_attribute_sources, 0xdeadbeef); u32 vertex_attribute_strides[16]; u32 vertex_attribute_formats[16]; - u32 vertex_attribute_elements[16]; + + // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below. + // This is one of the hacks required to deal with uninitalized vertex attributes. + // TODO: Fix this properly. 
+ u32 vertex_attribute_elements[16] = {}; u32 vertex_attribute_element_size[16]; // Setup attribute data from loaders @@ -252,11 +254,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { break; } - // Seems to be used to reset the write pointer for VSLoadProgramData - case PICA_REG_INDEX(vs_program.begin_load): - vs_binary_write_offset = 0; - break; - // Load shader program code case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc): case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd): @@ -267,16 +264,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2): case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3): { - VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value); - vs_binary_write_offset++; + VertexShader::SubmitShaderMemoryChange(registers.vs_program.offset, value); + registers.vs_program.offset++; break; } - // Seems to be used to reset the write pointer for VSLoadSwizzleData - case PICA_REG_INDEX(vs_swizzle_patterns.begin_load): - vs_swizzle_write_offset = 0; - break; - // Load swizzle pattern data case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6): case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7): @@ -287,8 +279,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc): case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd): { - VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value); - vs_swizzle_write_offset++; + VertexShader::SubmitSwizzleDataChange(registers.vs_swizzle_patterns.offset, value); + registers.vs_swizzle_patterns.offset++; break; } diff --git a/src/video_core/math.h b/src/video_core/math.h index c176b225a..f9a822658 100644 --- a/src/video_core/math.h +++ b/src/video_core/math.h @@ -631,7 +631,7 @@ static inline Vec4<T> MakeVec(const Vec3<T>& xyz, const T& w) } 
template<typename T> -static inline Vec4<T> MakeVec(const T& x, const Vec2<T>& yzw) +static inline Vec4<T> MakeVec(const T& x, const Vec3<T>& yzw) { return MakeVec(x, yzw[0], yzw[1], yzw[2]); } diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 9c1a12dc8..e4a5ef78e 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -118,8 +118,9 @@ struct Regs { struct TextureConfig { enum WrapMode : u32 { - ClampToEdge = 0, - Repeat = 2, + ClampToEdge = 0, + Repeat = 2, + MirroredRepeat = 3, }; INSERT_PADDING_WORDS(0x1); @@ -131,7 +132,7 @@ struct Regs { union { BitField< 8, 2, WrapMode> wrap_s; - BitField<11, 2, WrapMode> wrap_t; + BitField<12, 2, WrapMode> wrap_t; }; INSERT_PADDING_WORDS(0x1); @@ -223,6 +224,8 @@ struct Regs { struct TevStageConfig { enum class Source : u32 { PrimaryColor = 0x0, + PrimaryFragmentColor = 0x1, + Texture0 = 0x3, Texture1 = 0x4, Texture2 = 0x5, @@ -265,6 +268,9 @@ struct Regs { AddSigned = 3, Lerp = 4, Subtract = 5, + + MultiplyThenAdd = 8, + AddThenMultiply = 9, }; union { @@ -337,7 +343,7 @@ struct Regs { }; union { - enum BlendEquation : u32 { + enum class BlendEquation : u32 { Add = 0, Subtract = 1, ReverseSubtract = 2, @@ -421,7 +427,7 @@ struct Regs { INSERT_PADDING_WORDS(0x6); u32 depth_format; - u32 color_format; + BitField<16, 3, u32> color_format; INSERT_PADDING_WORDS(0x4); @@ -678,7 +684,9 @@ struct Regs { INSERT_PADDING_WORDS(0x2); struct { - u32 begin_load; + // Offset of the next instruction to write code to. + // Incremented with each instruction write. + u32 offset; // Writing to these registers sets the "current" word in the shader program. // TODO: It's not clear how the hardware stores what the "current" word is. @@ -690,7 +698,9 @@ struct Regs { // This register group is used to load an internal table of swizzling patterns, // which are indexed by each shader instruction to specify vector component swizzling. struct { - u32 begin_load; + // Offset of the next swizzle pattern to write code to. 
+ // Incremented with each swizzle pattern write. + u32 offset; // Writing to these registers sets the "current" swizzle pattern in the table. // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is. diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index 3faa10153..94873f406 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -5,6 +5,7 @@ #include <algorithm> #include "common/common_types.h" +#include "common/math_util.h" #include "math.h" #include "pica.h" @@ -20,16 +21,31 @@ namespace Rasterizer { static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) { const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress(); u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr))); - u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b(); - // Assuming RGBA8 format until actual framebuffer format handling is implemented - *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value; + // Similarly to textures, the render framebuffer is laid out from bottom to top, too. + // NOTE: The framebuffer height register contains the actual FB height minus one.
+ y = (registers.framebuffer.height - y); + + switch (registers.framebuffer.color_format) { + case registers.framebuffer.RGBA8: + { + u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b(); + *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value; + break; + } + + default: + LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format); + UNIMPLEMENTED(); + } } static const Math::Vec4<u8> GetPixel(int x, int y) { const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress(); u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr))); + y = (registers.framebuffer.height - y); + u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth()); Math::Vec4<u8> ret; ret.a() = value >> 24; @@ -43,6 +59,8 @@ static u32 GetDepth(int x, int y) { const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress(); u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr))); + y = (registers.framebuffer.height - y); + // Assuming 16-bit depth buffer format until actual format handling is implemented return *(depth_buffer + x + y * registers.framebuffer.GetWidth()); } @@ -51,6 +69,8 @@ static void SetDepth(int x, int y, u16 value) { const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress(); u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr))); + y = (registers.framebuffer.height - y); + // Assuming 16-bit depth buffer format until actual format handling is implemented *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value; } @@ -90,30 +110,43 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1, return Math::Cross(vec1, vec2).z; }; -void ProcessTriangle(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2) +/** + * Helper function for ProcessTriangle with the "reversed" flag to allow for 
implementing + * culling via recursion. + */ +static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, + const VertexShader::OutputVertex& v1, + const VertexShader::OutputVertex& v2, + bool reversed = false) { // vertex positions in rasterizer coordinates - auto FloatToFix = [](float24 flt) { - return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f)); - }; - auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) { - return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)}; - }; + static auto FloatToFix = [](float24 flt) { + // TODO: Rounding here is necessary to prevent garbage pixels at + // triangle borders. Is that the correct solution, though? + return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f))); + }; + static auto ScreenToRasterizerCoordinates = [](const Math::Vec3<float24>& vec) { + return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)}; + }; Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos), ScreenToRasterizerCoordinates(v1.screenpos), ScreenToRasterizerCoordinates(v2.screenpos) }; - if (registers.cull_mode == Regs::CullMode::KeepClockWise) { - // Reverse vertex order and use the CCW code path. - std::swap(vtxpos[1], vtxpos[2]); - } + if (registers.cull_mode == Regs::CullMode::KeepAll) { + // Make sure we always end up with a triangle wound counter-clockwise + if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { + ProcessTriangleInternal(v0, v2, v1, true); + return; + } + } else { + if (!reversed && registers.cull_mode == Regs::CullMode::KeepClockWise) { + // Reverse vertex order and use the CCW code path. + ProcessTriangleInternal(v0, v2, v1, true); + return; + } - if (registers.cull_mode != Regs::CullMode::KeepAll) { // Cull away triangles which are wound clockwise.
- // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) return; } @@ -155,9 +188,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, auto textures = registers.GetTextures(); auto tev_stages = registers.GetTevStages(); + // Enter rasterization loop, starting at the center of the topleft bounding box corner. // TODO: Not sure if looping through x first might be faster - for (u16 y = min_y; y < max_y; y += 0x10) { - for (u16 x = min_x; x < max_x; x += 0x10) { + for (u16 y = min_y + 8; y < max_y; y += 0x10) { + for (u16 x = min_x + 8; x < max_x; x += 0x10) { // Calculate the barycentric coordinates w0, w1 and w2 int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); @@ -220,7 +254,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32(); int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32(); - auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) { + static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) { switch (mode) { case Regs::TextureConfig::ClampToEdge: val = std::max(val, 0); @@ -228,7 +262,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, return val; case Regs::TextureConfig::Repeat: - return (int)(((unsigned)val) % size); + return (int)((unsigned)val % size); + + case Regs::TextureConfig::MirroredRepeat: + { + unsigned int coord = ((unsigned)val % (2 * size)); /* own name: redeclaring "val" here would shadow the parameter and read itself uninitialized */ + if (coord >= size) + coord = 2 * size - 1 - coord; + return (int)coord; + } default: LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode); @@ -236,6 +278,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, return 0; } }; + + // Textures are laid out from bottom to top, hence we invert the t coordinate.
+ // NOTE: This may not be the right place for the inversion. + // TODO: Check if this applies to ETC textures, too. s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); @@ -262,7 +308,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, auto GetSource = [&](Source source) -> Math::Vec4<u8> { switch (source) { + // TODO: What's the difference between these two? case Source::PrimaryColor: + case Source::PrimaryFragmentColor: return primary_color; case Source::Texture0: @@ -378,6 +426,25 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, return result.Cast<u8>(); } + case Operation::MultiplyThenAdd: + { + auto result = (input[0] * input[1] + 255 * input[2].Cast<int>()) / 255; + result.r() = std::min(255, result.r()); + result.g() = std::min(255, result.g()); + result.b() = std::min(255, result.b()); + return result.Cast<u8>(); + } + + case Operation::AddThenMultiply: + { + auto result = input[0] + input[1]; + result.r() = std::min(255, result.r()); + result.g() = std::min(255, result.g()); + result.b() = std::min(255, result.b()); + result = (result * input[2].Cast<int>()) / 255; + return result.Cast<u8>(); + } + default: LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op); UNIMPLEMENTED(); @@ -402,6 +469,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, case Operation::Subtract: return std::max(0, (int)input[0] - (int)input[1]); + case Operation::MultiplyThenAdd: + return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255); + + case Operation::AddThenMultiply: + return (std::min(255, (input[0] + input[1])) * input[2]) / 255; + default: LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op); UNIMPLEMENTED(); @@ -475,7 +548,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, // TODO: Does depth indeed only get written even if depth testing is enabled? 
if (registers.output_merger.depth_test_enable) { - u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 + + u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum); u16 ref_z = GetDepth(x >> 4, y >> 4); @@ -524,6 +597,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, } auto dest = GetPixel(x >> 4, y >> 4); + Math::Vec4<u8> blend_output = combiner_output; if (registers.output_merger.alphablend_enable) { auto params = registers.output_merger.alpha_blending; @@ -574,7 +648,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, default: LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor); - exit(0); + UNIMPLEMENTED(); break; } }; @@ -607,86 +681,78 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, default: LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor); - exit(0); + UNIMPLEMENTED(); + break; + } + }; + + using BlendEquation = decltype(params)::BlendEquation; + static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor, + const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor, + BlendEquation equation) { + Math::Vec4<int> result; + + auto src_result = (src * srcfactor).Cast<int>(); + auto dst_result = (dest * destfactor).Cast<int>(); + + switch (equation) { + case BlendEquation::Add: + result = (src_result + dst_result) / 255; break; + + case BlendEquation::Subtract: + result = (src_result - dst_result) / 255; + break; + + case BlendEquation::ReverseSubtract: + result = (dst_result - src_result) / 255; + break; + + // TODO: How do these two actually work? + // OpenGL doesn't include the blend factors in the min/max computations, + // but is this what the 3DS actually does? 
+ case BlendEquation::Min: + result.r() = std::min(src.r(), dest.r()); + result.g() = std::min(src.g(), dest.g()); + result.b() = std::min(src.b(), dest.b()); + result.a() = std::min(src.a(), dest.a()); + break; + + case BlendEquation::Max: + result.r() = std::max(src.r(), dest.r()); + result.g() = std::max(src.g(), dest.g()); + result.b() = std::max(src.b(), dest.b()); + result.a() = std::max(src.a(), dest.a()); + break; + + default: + LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", equation); + UNIMPLEMENTED(); } + + return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255), + MathUtil::Clamp(result.g(), 0, 255), + MathUtil::Clamp(result.b(), 0, 255), + MathUtil::Clamp(result.a(), 0, 255)); }; auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb), LookupFactorA(params.factor_source_a)); auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb), LookupFactorA(params.factor_dest_a)); - - auto src_result = (combiner_output * srcfactor).Cast<int>(); - auto dst_result = (dest * dstfactor).Cast<int>(); - - switch (params.blend_equation_rgb) { - case params.Add: - { - auto result = (src_result + dst_result) / 255; - result.r() = std::min(255, result.r()); - result.g() = std::min(255, result.g()); - result.b() = std::min(255, result.b()); - combiner_output = result.Cast<u8>(); - break; - } - - case params.Subtract: - { - auto result = (src_result - dst_result) / 255; - result.r() = std::max(0, result.r()); - result.g() = std::max(0, result.g()); - result.b() = std::max(0, result.b()); - combiner_output = result.Cast<u8>(); - break; - } - - case params.ReverseSubtract: - { - auto result = (dst_result - src_result) / 255; - result.r() = std::max(0, result.r()); - result.g() = std::max(0, result.g()); - result.b() = std::max(0, result.b()); - combiner_output = result.Cast<u8>(); - break; - } - - case params.Min: - { - // TODO: GL spec says to do it without the factors, but is this what the 3DS does? 
- Math::Vec4<int> result; - result.r() = std::min(combiner_output.r(),dest.r()); - result.g() = std::min(combiner_output.g(),dest.g()); - result.b() = std::min(combiner_output.b(),dest.b()); - combiner_output = result.Cast<u8>(); - break; - } - - case params.Max: - { - // TODO: GL spec says to do it without the factors, but is this what the 3DS does? - Math::Vec4<int> result; - result.r() = std::max(combiner_output.r(),dest.r()); - result.g() = std::max(combiner_output.g(),dest.g()); - result.b() = std::max(combiner_output.b(),dest.b()); - combiner_output = result.Cast<u8>(); - break; - } - default: - LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value()); - exit(0); - } + blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb); + blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a(); } else { LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op); - exit(0); + UNIMPLEMENTED(); } const Math::Vec4<u8> result = { - registers.output_merger.red_enable ? combiner_output.r() : dest.r(), - registers.output_merger.green_enable ? combiner_output.g() : dest.g(), - registers.output_merger.blue_enable ? combiner_output.b() : dest.b(), - registers.output_merger.alpha_enable ? combiner_output.a() : dest.a() + registers.output_merger.red_enable ? blend_output.r() : dest.r(), + registers.output_merger.green_enable ? blend_output.g() : dest.g(), + registers.output_merger.blue_enable ? blend_output.b() : dest.b(), + registers.output_merger.alpha_enable ? 
blend_output.a() : dest.a() }; DrawPixel(x >> 4, y >> 4, result); @@ -694,6 +760,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, } } +void ProcessTriangle(const VertexShader::OutputVertex& v0, + const VertexShader::OutputVertex& v1, + const VertexShader::OutputVertex& v2) { + ProcessTriangleInternal(v0, v1, v2); +} + } // namespace Rasterizer } // namespace Pica diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 735c0cf45..272695174 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -61,15 +61,13 @@ void RendererOpenGL::SwapBuffers() { for(int i : {0, 1}) { const auto& framebuffer = GPU::g_regs.framebuffer_config[i]; - if (textures[i].width != (GLsizei)framebuffer.width || textures[i].height != (GLsizei)framebuffer.height) { + if (textures[i].width != (GLsizei)framebuffer.width || + textures[i].height != (GLsizei)framebuffer.height || + textures[i].format != framebuffer.color_format) { // Reallocate texture if the framebuffer size has changed. // This is expected to not happen very often and hence should not be a // performance problem. 
- glBindTexture(GL_TEXTURE_2D, textures[i].handle); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, framebuffer.width, framebuffer.height, 0, - GL_BGR, GL_UNSIGNED_BYTE, nullptr); - textures[i].width = framebuffer.width; - textures[i].height = framebuffer.height; + ConfigureFramebufferTexture(textures[i], framebuffer); } LoadFBToActiveGLTexture(GPU::g_regs.framebuffer_config[i], textures[i]); @@ -98,13 +96,12 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& const u8* framebuffer_data = Memory::GetPointer(framebuffer_vaddr); - // TODO: Handle other pixel formats - ASSERT_MSG(framebuffer.color_format == GPU::Regs::PixelFormat::RGB8, - "Unsupported 3DS pixel format."); + int bpp = GPU::Regs::BytesPerPixel(framebuffer.color_format); + size_t pixel_stride = framebuffer.stride / bpp; - size_t pixel_stride = framebuffer.stride / 3; // OpenGL only supports specifying a stride in units of pixels, not bytes, unfortunately - ASSERT(pixel_stride * 3 == framebuffer.stride); + ASSERT(pixel_stride * bpp == framebuffer.stride); + // Ensure no bad interactions with GL_UNPACK_ALIGNMENT, which by default // only allows rows to have a memory alignement of 4. ASSERT(pixel_stride % 4 == 0); @@ -118,7 +115,7 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& // TODO: Applications could theoretically crash Citra here by specifying too large // framebuffer sizes. We should make sure that this cannot happen. 
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, framebuffer.width, framebuffer.height, - GL_BGR, GL_UNSIGNED_BYTE, framebuffer_data); + texture.gl_format, texture.gl_type, framebuffer_data); glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); @@ -171,6 +168,59 @@ void RendererOpenGL::InitOpenGLObjects() { glBindTexture(GL_TEXTURE_2D, 0); } +void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, + const GPU::Regs::FramebufferConfig& framebuffer) { + GPU::Regs::PixelFormat format = framebuffer.color_format; + GLint internal_format; + + texture.format = format; + texture.width = framebuffer.width; + texture.height = framebuffer.height; + + switch (format) { + case GPU::Regs::PixelFormat::RGBA8: + internal_format = GL_RGBA; + texture.gl_format = GL_RGBA; + texture.gl_type = GL_UNSIGNED_INT_8_8_8_8; + break; + + case GPU::Regs::PixelFormat::RGB8: + // This pixel format uses BGR since GL_UNSIGNED_BYTE specifies byte-order, unlike every + // specific OpenGL type used in this function using native-endian (that is, little-endian + // mostly everywhere) for words or half-words. + // TODO: check how those behave on big-endian processors. 
+ internal_format = GL_RGB; + texture.gl_format = GL_BGR; + texture.gl_type = GL_UNSIGNED_BYTE; + break; + + case GPU::Regs::PixelFormat::RGB565: + internal_format = GL_RGB; + texture.gl_format = GL_RGB; + texture.gl_type = GL_UNSIGNED_SHORT_5_6_5; + break; + + case GPU::Regs::PixelFormat::RGB5A1: + internal_format = GL_RGBA; + texture.gl_format = GL_RGBA; + texture.gl_type = GL_UNSIGNED_SHORT_5_5_5_1; + break; + + case GPU::Regs::PixelFormat::RGBA4: + internal_format = GL_RGBA; + texture.gl_format = GL_RGBA; + texture.gl_type = GL_UNSIGNED_SHORT_4_4_4_4; + break; + + default: + UNIMPLEMENTED(); + } + + glBindTexture(GL_TEXTURE_2D, texture.handle); + glTexImage2D(GL_TEXTURE_2D, 0, internal_format, texture.width, texture.height, 0, + texture.gl_format, texture.gl_type, nullptr); +} + /** * Draws a single texture to the emulator window, rotating the texture to correct for the 3DS's LCD rotation. */ diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index cf78c1e77..bcabab557 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -43,9 +43,14 @@ private: GLuint handle; GLsizei width; GLsizei height; + GPU::Regs::PixelFormat format; + GLenum gl_format; + GLenum gl_type; }; void InitOpenGLObjects(); + static void ConfigureFramebufferTexture(TextureInfo& texture, + const GPU::Regs::FramebufferConfig& framebuffer); void DrawScreens(); void DrawSingleScreenRotated(const TextureInfo& texture, float x, float y, float w, float h); void UpdateFramerate(); diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp index 80935a50a..def868ac7 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/vertex_shader.cpp @@ -85,8 +85,11 @@ struct VertexShaderState { }; struct CallStackElement { - u32 final_address; - u32 return_address; + u32 final_address; // Address upon which we jump to return_address + u32 return_address; // 
Where to jump when leaving scope + u8 repeat_counter; // How often to repeat until this call stack element is removed + u8 loop_increment; // Which value to add to the loop counter after an iteration + // TODO: Should this be a signed value? Does it even matter? }; // TODO: Is there a maximal size for this? @@ -105,9 +108,14 @@ static void ProcessShaderCode(VertexShaderState& state) { while (true) { if (!state.call_stack.empty()) { - if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) { - state.program_counter = &shader_memory[state.call_stack.top().return_address]; - state.call_stack.pop(); + auto& top = state.call_stack.top(); + if (state.program_counter - shader_memory.data() == top.final_address) { + state.address_registers[2] += top.loop_increment; + + if (top.repeat_counter-- == 0) { + state.program_counter = &shader_memory[top.return_address]; + state.call_stack.pop(); + } // TODO: Is "trying again" accurate to hardware? continue; @@ -118,9 +126,10 @@ static void ProcessShaderCode(VertexShaderState& state) { const Instruction& instr = *(const Instruction*)state.program_counter; const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; - auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions, u32 return_offset) { + static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions, + u32 return_offset, u8 repeat_count, u8 loop_increment) { state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset - state.call_stack.push({ offset + num_instructions, return_offset }); + state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment }); }; u32 binary_offset = state.program_counter - shader_memory.data(); @@ -457,7 +466,7 @@ static void ProcessShaderCode(VertexShaderState& state) { call(state, instr.flow_control.dest_offset, 
instr.flow_control.num_instructions, - binary_offset + 1); + binary_offset + 1, 0, 0); break; case Instruction::OpCode::CALLU: @@ -465,7 +474,7 @@ static void ProcessShaderCode(VertexShaderState& state) { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - binary_offset + 1); + binary_offset + 1, 0, 0); } break; @@ -474,7 +483,7 @@ static void ProcessShaderCode(VertexShaderState& state) { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - binary_offset + 1); + binary_offset + 1, 0, 0); } break; @@ -486,12 +495,12 @@ static void ProcessShaderCode(VertexShaderState& state) { call(state, binary_offset + 1, instr.flow_control.dest_offset - binary_offset - 1, - instr.flow_control.dest_offset + instr.flow_control.num_instructions); + instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } else { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - instr.flow_control.dest_offset + instr.flow_control.num_instructions); + instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } break; @@ -504,17 +513,30 @@ static void ProcessShaderCode(VertexShaderState& state) { call(state, binary_offset + 1, instr.flow_control.dest_offset - binary_offset - 1, - instr.flow_control.dest_offset + instr.flow_control.num_instructions); + instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } else { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - instr.flow_control.dest_offset + instr.flow_control.num_instructions); + instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } break; } + case Instruction::OpCode::LOOP: + { + state.address_registers[2] = shader_uniforms.i[instr.flow_control.int_uniform_id].y; + + call(state, + binary_offset + 1, + instr.flow_control.dest_offset - binary_offset + 1, + instr.flow_control.dest_offset + 1, + 
shader_uniforms.i[instr.flow_control.int_uniform_id].x, + shader_uniforms.i[instr.flow_control.int_uniform_id].z); + break; + } + default: LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex); |