summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- src/video_core/pica.h 14
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.cpp 4
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer_cache.cpp 18
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer_cache.h 4
-rw-r--r-- src/video_core/shader/shader_interpreter.cpp 10
-rw-r--r-- src/video_core/shader/shader_jit_x64.cpp 131
-rw-r--r-- src/video_core/shader/shader_jit_x64.h 6
7 files changed, 120 insertions, 67 deletions
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 58b924f9e..bb689f2a9 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1021,12 +1021,20 @@ struct float24 {
return ret;
}
+ static float24 Zero() {
+ return FromFloat32(0.f);
+ }
+
// Not recommended for anything but logging
float ToFloat32() const {
return value;
}
float24 operator * (const float24& flt) const {
+ if ((this->value == 0.f && !std::isnan(flt.value)) ||
+ (flt.value == 0.f && !std::isnan(this->value)))
+ // PICA gives 0 instead of NaN when multiplying by inf
+ return Zero();
return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
}
@@ -1043,7 +1051,11 @@ struct float24 {
}
float24& operator *= (const float24& flt) {
- value *= flt.ToFloat32();
+ if ((this->value == 0.f && !std::isnan(flt.value)) ||
+ (flt.value == 0.f && !std::isnan(this->value)))
+ // PICA gives 0 instead of NaN when multiplying by inf
+ *this = Zero();
+ else value *= flt.ToFloat32();
return *this;
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index deb9971bb..f0ccc2397 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -231,8 +231,8 @@ void RasterizerOpenGL::DrawTriangles() {
u32 cur_fb_depth_size = Pica::Regs::BytesPerDepthPixel(regs.framebuffer.depth_format)
* regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
- res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size);
- res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size);
+ res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size, true);
+ res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size, true);
}
void RasterizerOpenGL::CommitFramebuffer() {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index e4247051c..1e38c2e6d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include "common/hash.h"
#include "common/make_unique.h"
#include "common/math_util.h"
#include "common/microprofile.h"
@@ -21,7 +22,6 @@ MICROPROFILE_DEFINE(OpenGL_TextureUpload, "OpenGL", "Texture Upload", MP_RGB(128
void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned texture_unit, const Pica::Regs::FullTextureConfig& config) {
PAddr texture_addr = config.config.GetPhysicalAddress();
-
const auto cached_texture = texture_cache.find(texture_addr);
if (cached_texture != texture_cache.end()) {
@@ -51,12 +51,14 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text
}
const auto info = Pica::DebugUtils::TextureInfo::FromPicaRegister(config.config, config.format);
+ u8* texture_src_data = Memory::GetPhysicalPointer(texture_addr);
new_texture->width = info.width;
new_texture->height = info.height;
- new_texture->size = info.width * info.height * Pica::Regs::NibblesPerPixel(info.format);
+ new_texture->size = info.stride * info.height;
+ new_texture->addr = texture_addr;
+ new_texture->hash = Common::ComputeHash64(texture_src_data, new_texture->size);
- u8* texture_src_data = Memory::GetPhysicalPointer(texture_addr);
std::unique_ptr<Math::Vec4<u8>[]> temp_texture_buffer_rgba(new Math::Vec4<u8>[info.width * info.height]);
for (int y = 0; y < info.height; ++y) {
@@ -71,12 +73,18 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text
}
}
-void RasterizerCacheOpenGL::NotifyFlush(PAddr addr, u32 size) {
+void RasterizerCacheOpenGL::NotifyFlush(PAddr addr, u32 size, bool ignore_hash) {
// Flush any texture that falls in the flushed region
// TODO: Optimize by also inserting upper bound (addr + size) of each texture into the same map and also narrow using lower_bound
auto cache_upper_bound = texture_cache.upper_bound(addr + size);
+
for (auto it = texture_cache.begin(); it != cache_upper_bound;) {
- if (MathUtil::IntervalsIntersect(addr, size, it->first, it->second->size)) {
+ const auto& info = *it->second;
+
+ // Flush the texture only if the memory region intersects and a change is detected
+ if (MathUtil::IntervalsIntersect(addr, size, info.addr, info.size) &&
+ (ignore_hash || info.hash != Common::ComputeHash64(Memory::GetPhysicalPointer(info.addr), info.size))) {
+
it = texture_cache.erase(it);
} else {
++it;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 96f3a925c..d8f9edf59 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -19,7 +19,7 @@ public:
void LoadAndBindTexture(OpenGLState &state, unsigned texture_unit, const Pica::Regs::FullTextureConfig& config);
/// Flush any cached resource that touches the flushed region
- void NotifyFlush(PAddr addr, u32 size);
+ void NotifyFlush(PAddr addr, u32 size, bool ignore_hash = false);
/// Flush all cached OpenGL resources tracked by this cache manager
void FullFlush();
@@ -30,6 +30,8 @@ private:
GLuint width;
GLuint height;
u32 size;
+ u64 hash;
+ PAddr addr;
};
std::map<PAddr, std::unique_ptr<CachedTexture>> texture_cache;
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index ae5a30441..69e4efa68 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) {
if (!swizzle.DestComponentEnabled(i))
continue;
- dest[i] = std::max(src1[i], src2[i]);
+ // NOTE: Exact form required to match NaN semantics to hardware:
+ // max(0, NaN) -> NaN
+ // max(NaN, 0) -> 0
+ dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
}
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
@@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) {
if (!swizzle.DestComponentEnabled(i))
continue;
- dest[i] = std::min(src1[i], src2[i]);
+ // NOTE: Exact form required to match NaN semantics to hardware:
+ // min(0, NaN) -> NaN
+ // min(NaN, 0) -> 0
+ dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
}
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break;
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index cc66fc8d6..d3cfe109e 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;
static const X64Reg SRC2 = XMM2;
/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
static const X64Reg SRC3 = XMM3;
+/// Additional scratch register
+static const X64Reg SCRATCH2 = XMM4;
/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
static const X64Reg ONE = XMM14;
/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
@@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
BLENDPS(SCRATCH, R(src), mask);
} else {
- MOVAPS(XMM4, R(src));
- UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination
+ MOVAPS(SCRATCH2, R(src));
+ UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination
UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination
// Compute selector to selectively copy source components to destination for SHUFPS instruction
@@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
- SHUFPS(SCRATCH, R(XMM4), sel);
+ SHUFPS(SCRATCH, R(SCRATCH2), sel);
}
// Store dest back to memory
@@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
}
}
+void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
+ MOVAPS(scratch, R(src1));
+ CMPPS(scratch, R(src2), CMP_ORD);
+
+ MULPS(src1, R(src2));
+
+ MOVAPS(src2, R(src1));
+ CMPPS(src2, R(src2), CMP_UNORD);
+
+ XORPS(scratch, R(src2));
+ ANDPS(src1, R(scratch));
+}
+
void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
// Note: NXOR is used below to check for equality
switch (instr.flow_control.op) {
@@ -307,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- if (Common::GetCPUCaps().sse4_1) {
- DPPS(SRC1, R(SRC2), 0x7f);
- } else {
- MULPS(SRC1, R(SRC2));
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
- MOVAPS(SRC2, R(SRC1));
- SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
- MOVAPS(SRC3, R(SRC1));
- SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
+ MOVAPS(SRC3, R(SRC1));
+ SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
- ADDPS(SRC1, R(SRC2));
- ADDPS(SRC1, R(SRC3));
- }
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
+ ADDPS(SRC1, R(SRC2));
+ ADDPS(SRC1, R(SRC3));
Compile_DestEnable(instr, SRC1);
}
@@ -330,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- if (Common::GetCPUCaps().sse4_1) {
- DPPS(SRC1, R(SRC2), 0xff);
- } else {
- MULPS(SRC1, R(SRC2));
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
- MOVAPS(SRC2, R(SRC1));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
- ADDPS(SRC1, R(SRC2));
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+ ADDPS(SRC1, R(SRC2));
- MOVAPS(SRC2, R(SRC1));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
- ADDPS(SRC1, R(SRC2));
- }
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+ ADDPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
@@ -359,23 +366,22 @@ void JitCompiler::Compile_DPH(Instruction instr) {
if (Common::GetCPUCaps().sse4_1) {
// Set 4th component to 1.0
BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
- DPPS(SRC1, R(SRC2), 0xff);
} else {
- // Reverse to set the 4th component to 1.0
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
- MOVSS(SRC1, R(ONE));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
+ // Set 4th component to 1.0
+ MOVAPS(SCRATCH, R(SRC1));
+ UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__
+ UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1
+ }
- MULPS(SRC1, R(SRC2));
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
- MOVAPS(SRC2, R(SRC1));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
- ADDPS(SRC1, R(SRC2));
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+ ADDPS(SRC1, R(SRC2));
- MOVAPS(SRC2, R(SRC1));
- SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
- ADDPS(SRC1, R(SRC2));
- }
+ MOVAPS(SRC2, R(SRC1));
+ SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+ ADDPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
@@ -415,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
void JitCompiler::Compile_MUL(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- MULPS(SRC1, R(SRC2));
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
Compile_DestEnable(instr, SRC1);
}
@@ -465,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
void JitCompiler::Compile_MAX(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
MAXPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
@@ -472,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
void JitCompiler::Compile_MIN(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
MINPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
@@ -485,8 +493,8 @@ void JitCompiler::Compile_MOVA(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- // Convert floats to integers (only care about X and Y components)
- CVTPS2DQ(SRC1, R(SRC1));
+ // Convert floats to integers using truncation (only care about X and Y components)
+ CVTTPS2DQ(SRC1, R(SRC1));
// Get result
MOVQ_xmm(R(RAX), SRC1);
@@ -578,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {
}
void JitCompiler::Compile_CMP(Instruction instr) {
+ using Op = Instruction::Common::CompareOpType::Op;
+ Op op_x = instr.common.compare_op.x;
+ Op op_y = instr.common.compare_op.y;
+
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT };
+ // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
+ // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
+ // because they don't match when used with NaNs.
+ static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE };
+
+ bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
+ Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1;
+ Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2;
- if (instr.common.compare_op.x == instr.common.compare_op.y) {
+ if (op_x == op_y) {
// Compare X-component and Y-component together
- CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]);
+ CMPPS(lhs_x, R(rhs_x), cmp[op_x]);
+ MOVQ_xmm(R(COND0), lhs_x);
- MOVQ_xmm(R(COND0), SRC1);
MOV(64, R(COND1), R(COND0));
} else {
+ bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
+ Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1;
+ Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2;
+
// Compare X-component
- MOVAPS(SCRATCH, R(SRC1));
- CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]);
+ MOVAPS(SCRATCH, R(lhs_x));
+ CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);
// Compare Y-component
- CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]);
+ CMPPS(lhs_y, R(rhs_y), cmp[op_y]);
MOVQ_xmm(R(COND0), SCRATCH);
- MOVQ_xmm(R(COND1), SRC1);
+ MOVQ_xmm(R(COND1), lhs_y);
}
SHR(32, R(COND0), Imm8(31));
@@ -616,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
}
- if (Common::GetCPUCaps().fma) {
- VFMADD213PS(SRC1, SRC2, R(SRC3));
- } else {
- MULPS(SRC1, R(SRC2));
- ADDPS(SRC1, R(SRC3));
- }
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+ ADDPS(SRC1, R(SRC3));
Compile_DestEnable(instr, SRC1);
}
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index fbe19fe93..58828ecc8 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -68,6 +68,12 @@ private:
void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
+ /**
+ * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
+ * zero by inf. Clobbers `src2` and `scratch`.
+ */
+ void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
+
void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr);