From b2f472a2b1f36073b0070b81c08a666380ad180d Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Fri, 9 Jun 2017 13:14:55 -0500
Subject: SwRasterizer: Implement primary fragment color.

---
 src/video_core/swrasterizer/rasterizer.cpp | 117 ++++++++++++++++++++++++++++-
 1 file changed, 113 insertions(+), 4 deletions(-)

(limited to 'src/video_core')
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 512e81c08..1ab41c2df 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -13,6 +13,7 @@
 #include "common/logging/log.h"
 #include "common/math_util.h"
 #include "common/microprofile.h"
+#include "common/quaternion.h"
 #include "common/vector_math.h"
 #include "core/hw/gpu.h"
 #include "core/memory.h"
@@ -114,6 +115,86 @@ static std::tuple<float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v
     return std::make_tuple(x / z * half + half, y / z * half + half, addr);
 }
 
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
+    const auto& lighting = g_state.regs.lighting;
+
+    if (lighting.disable)
+        return {{}, {}};
+
+    // TODO(Subv): Bump mapping
+    Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
+
+    if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
+        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping");
+        UNIMPLEMENTED();
+    }
+
+    // TODO(Subv): Do we need to normalize the quaternion here?
+    auto normal = Math::QuaternionRotate(normquat, surface_normal);
+
+    Math::Vec3<float> light_vector = {};
+    Math::Vec3<float> diffuse_sum = {};
+    // TODO(Subv): Calculate specular
+    Math::Vec3<float> specular_sum = {};
+
+    for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
+        unsigned num = lighting.light_enable.GetNum(light_index);
+        const auto& light_config = g_state.regs.lighting.light[num];
+
+        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(), float16::FromRaw(light_config.y).ToFloat32(), float16::FromRaw(light_config.z).ToFloat32()};
+
+        if (light_config.config.directional)
+            light_vector = position;
+        else
+            light_vector = position + view;
+
+        light_vector.Normalize();
+
+        auto dot_product = Math::Dot(light_vector, normal);
+
+        if (light_config.config.two_sided_diffuse)
+            dot_product = std::abs(dot_product);
+        else
+            dot_product = std::max(dot_product, 0.0f);
+
+        float dist_atten = 1.0f;
+        if (!lighting.IsDistAttenDisabled(num)) {
+            auto distance = (-view - position).Length();
+            float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
+            float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
+            size_t lut = static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
+
+            float sample_loc = scale * distance + bias;
+            unsigned index_i = static_cast<unsigned>(MathUtil::Clamp(floor(sample_loc * 256), 0.0f, 1.0f));
+
+            float index_f = sample_loc - index_i;
+
+            ASSERT_MSG(lut < g_state.lighting.luts.size(), "Out of range lut");
+
+            float lut_value = g_state.lighting.luts[lut][index_i].ToFloat();
+            float lut_diff = g_state.lighting.luts[lut][index_i].DiffToFloat();
+
+            dist_atten = lut_value + lut_diff * index_f;
+        }
+
+        auto diffuse = light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
+        diffuse_sum += diffuse * dist_atten;
+    }
+
+    diffuse_sum += lighting.global_ambient.ToVec3f();
+    return {
+        Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255, 255).Cast<u8>(),
+        Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255, 255).Cast<u8>()
+    };
+}
+
+static bool AreQuaternionsOpposite(Math::Vec4<Pica::float24> qa, Math::Vec4<Pica::float24> qb) {
+    Math::Vec4f a{ qa.x.ToFloat32(), qa.y.ToFloat32(), qa.z.ToFloat32(), qa.w.ToFloat32() };
+    Math::Vec4f b{ qb.x.ToFloat32(), qb.y.ToFloat32(), qb.z.ToFloat32(), qb.w.ToFloat32() };
+
+    return (Math::Dot(a, b) < 0.f);
+}
+
 MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240));
 
 /**
@@ -207,6 +288,15 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
     int bias2 =
         IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
 
+    // Flip the quaternions if they are opposite to prevent interpolating them over the wrong direction.
+    auto v1_quat = v1.quat;
+    auto v2_quat = v2.quat;
+
+    if (AreQuaternionsOpposite(v0.quat, v1.quat))
+        v1_quat = v1_quat * float24::FromFloat32(-1.0f);
+    if (AreQuaternionsOpposite(v0.quat, v2.quat))
+        v2_quat = v2_quat * float24::FromFloat32(-1.0f);
+
     auto w_inverse = Math::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);
 
     auto textures = regs.texturing.GetTextures();
@@ -305,6 +395,21 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                     255),
             };
 
+            Math::Quaternion<float> normquat{
+                {
+                    GetInterpolatedAttribute(v0.quat.x, v1_quat.x, v2_quat.x).ToFloat32(),
+                    GetInterpolatedAttribute(v0.quat.y, v1_quat.y, v2_quat.y).ToFloat32(),
+                    GetInterpolatedAttribute(v0.quat.z, v1_quat.z, v2_quat.z).ToFloat32()
+                },
+                GetInterpolatedAttribute(v0.quat.w, v1_quat.w, v2_quat.w).ToFloat32(),
+            };
+
+            Math::Vec3<float> fragment_position{
+                GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
+                GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
+                GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32()
+            };
+
             Math::Vec2<float24> uv[3];
             uv[0].u() = GetInterpolatedAttribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
             uv[0].v() = GetInterpolatedAttribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
@@ -419,6 +524,11 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                 regs.texturing.tev_combiner_buffer_color.a,
             };
 
+            Math::Vec4<u8> primary_fragment_color;
+            Math::Vec4<u8> secondary_fragment_color;
+
+            std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(normquat, fragment_position);
+
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
                  ++tev_stage_index) {
                 const auto& tev_stage = tev_stages[tev_stage_index];
@@ -427,14 +537,13 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                 auto GetSource = [&](Source source) -> Math::Vec4<u8> {
                     switch (source) {
                     case Source::PrimaryColor:
+                        return primary_color;
 
-                    // HACK: Until we implement fragment lighting, use primary_color
                     case Source::PrimaryFragmentColor:
-                        return primary_color;
+                        return primary_fragment_color;
 
-                    // HACK: Until we implement fragment lighting, use zero
                     case Source::SecondaryFragmentColor:
-                        return {0, 0, 0, 0};
+                        return secondary_fragment_color;
 
                     case Source::Texture0:
                         return texture_color[0];
-- 
cgit v1.2.3


From be25e78b07140cb745387f757001dd04b3b4cc64 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Fri, 9 Jun 2017 14:25:41 -0500
Subject: SwRasterizer: Calculate specular_0 for fragment lighting.

---
 src/video_core/swrasterizer/rasterizer.cpp | 107 +++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 13 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 1ab41c2df..34b84b0af 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -115,6 +115,20 @@ static std::tuple<float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v
     return std::make_tuple(x / z * half + half, y / z * half + half, addr);
 }
 
+
+float LookupLightingLut(size_t lut_index, float index) {
+    unsigned index_i = static_cast<unsigned>(MathUtil::Clamp(floor(index * 256), 0.0f, 1.0f));
+
+    float index_f = index - index_i;
+
+    ASSERT_MSG(lut_index < g_state.lighting.luts.size(), "Out of range lut");
+
+    float lut_value = g_state.lighting.luts[lut_index][index_i].ToFloat();
+    float lut_diff = g_state.lighting.luts[lut_index][index_i].DiffToFloat();
+
+    return lut_value + lut_diff * index_f;
+}
+
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
     const auto& lighting = g_state.regs.lighting;
 
@@ -133,9 +147,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
     auto normal = Math::QuaternionRotate(normquat, surface_normal);
 
     Math::Vec3<float> light_vector = {};
-    Math::Vec3<float> diffuse_sum = {};
+    Math::Vec4<float> diffuse_sum = {0.f, 0.f, 0.f, 1.f};
     // TODO(Subv): Calculate specular
-    Math::Vec3<float> specular_sum = {};
+    Math::Vec4<float> specular_sum = {0.f, 0.f, 0.f, 1.f};
 
     for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
         unsigned num = lighting.light_enable.GetNum(light_index);
@@ -150,7 +164,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
 
         light_vector.Normalize();
 
-        auto dot_product = Math::Dot(light_vector, normal);
+        auto LV_N = Math::Dot(light_vector, normal);
+        auto dot_product = LV_N;
 
         if (light_config.config.two_sided_diffuse)
             dot_product = std::abs(dot_product);
@@ -165,26 +180,92 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
             size_t lut = static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
 
             float sample_loc = scale * distance + bias;
-            unsigned index_i = static_cast<unsigned>(MathUtil::Clamp(floor(sample_loc * 256), 0.0f, 1.0f));
+            dist_atten = LookupLightingLut(lut, sample_loc);
+        }
+
+        float clamp_highlights = 1.0f;
+
+        if (lighting.config0.clamp_highlights) {
+            if (LV_N <= 0.f)
+                clamp_highlights = 0.f;
+            else
+                clamp_highlights = 1.f;
+        }
+
+        auto GetLutIndex = [&](unsigned num, LightingRegs::LightingLutInput input,
+                              bool abs) -> float {
+
+            Math::Vec3<float> norm_view = view.Normalized();
+            Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
+            float result = 0.0f;
+
+            switch (input) {
+            case LightingRegs::LightingLutInput::NH:
+                result = Math::Dot(normal, half_angle);
+                break;
 
-            float index_f = sample_loc - index_i;
+            case LightingRegs::LightingLutInput::VH:
+                result = Math::Dot(norm_view, half_angle);
+                break;
 
-            ASSERT_MSG(lut < g_state.lighting.luts.size(), "Out of range lut");
+            case LightingRegs::LightingLutInput::NV:
+                result = Math::Dot(normal, norm_view);
+                break;
 
-            float lut_value = g_state.lighting.luts[lut][index_i].ToFloat();
-            float lut_diff = g_state.lighting.luts[lut][index_i].DiffToFloat();
+            case LightingRegs::LightingLutInput::LN:
+                result = Math::Dot(light_vector, normal);
+                break;
 
-            dist_atten = lut_value + lut_diff * index_f;
+            default:
+                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %d\n", (int)input);
+                UNIMPLEMENTED();
+                result = 0.f;
+            }
+
+            if (abs) {
+                if (light_config.config.two_sided_diffuse)
+                    result = std::abs(result);
+                else
+                    result = std::max(result, 0.0f);
+            } else {
+                if (result < 0.f)
+                    result += 2.f;
+
+                result /= 2.f;
+            }
+
+            return MathUtil::Clamp(result, 0.0f, 1.0f);
+        };
+
+        // Specular 0 component
+        float d0_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d0 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
+
+            // Lookup specular "distribution 0" LUT value
+            float index = GetLutIndex(num, lighting.lut_input.d0.Value(), lighting.abs_lut_input.disable_d0 == 0);
+
+            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.d0);
+
+            d0_lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution0), index);
         }
 
+        Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
+
+        // TODO(Subv): Specular 1
+        Math::Vec3<float> specular_1 = {};
+
         auto diffuse = light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
-        diffuse_sum += diffuse * dist_atten;
+        diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
+
+        specular_sum += Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.f);
     }
 
-    diffuse_sum += lighting.global_ambient.ToVec3f();
+    diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
     return {
-        Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255, 255).Cast<u8>(),
-        Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255, 255).Cast<u8>()
+        Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255).Cast<u8>(),
+        Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255).Cast<u8>()
     };
 }
 
-- 
cgit v1.2.3


From 46b8c8e1da6bc29df2662d63b0e028136fef3636 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Fri, 9 Jun 2017 14:44:02 -0500
Subject: SwRasterizer: Calculate specular_1 for fragment lighting.

---
 src/video_core/swrasterizer/rasterizer.cpp | 62 ++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 3 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 34b84b0af..e0c326a4a 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -148,8 +148,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
 
     Math::Vec3<float> light_vector = {};
     Math::Vec4<float> diffuse_sum = {0.f, 0.f, 0.f, 1.f};
-    // TODO(Subv): Calculate specular
     Math::Vec4<float> specular_sum = {0.f, 0.f, 0.f, 1.f};
+    Math::Vec3<float> refl_value = {};
 
     for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
         unsigned num = lighting.light_enable.GetNum(light_index);
@@ -253,8 +253,64 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
 
         Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
 
-        // TODO(Subv): Specular 1
-        Math::Vec3<float> specular_1 = {};
+        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
+        if (lighting.config1.disable_lut_rr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectRed)) {
+
+            float index = GetLutIndex(num, lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0);
+
+            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rr);
+
+            refl_value.x = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed), index);
+        } else {
+            refl_value.x = 1.0f;
+        }
+
+        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rg == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectGreen)) {
+
+            float index = GetLutIndex(num, lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0);
+
+            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rg);
+
+            refl_value.y = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen), index);
+        } else {
+            refl_value.y = refl_value.x;
+        }
+
+        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rb == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectBlue)) {
+
+            float index = GetLutIndex(num, lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0);
+
+            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rb);
+
+            refl_value.z = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue), index);
+        } else {
+            refl_value.z = refl_value.x;
+        }
+
+        float d1_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d1 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
+
+            // Lookup specular "distribution 1" LUT value
+            float index = GetLutIndex(num, lighting.lut_input.d1.Value(), lighting.abs_lut_input.disable_d1 == 0);
+
+            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.d1);
+
+            d1_lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution1), index);
+        }
+
+        Math::Vec3<float> specular_1 = d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
+
+        // TODO(Subv): Fresnel
 
         auto diffuse = light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
         diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
-- 
cgit v1.2.3


From 10b0bea06008fea89564dc5ef8895c0274f8ef18 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Fri, 9 Jun 2017 14:55:54 -0500
Subject: SwRasterizer: Calculate fresnel for fragment lighting.

---
 src/video_core/swrasterizer/rasterizer.cpp | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index e0c326a4a..2d1daa24a 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -310,7 +310,31 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
 
         Math::Vec3<float> specular_1 = d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
 
-        // TODO(Subv): Fresnel
+        if (lighting.config1.disable_lut_fr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Fresnel)) {
+
+            // Lookup fresnel LUT value
+            float index = GetLutIndex(num, lighting.lut_input.fr.Value(), lighting.abs_lut_input.disable_fr == 0);
+
+            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.fr);
+
+            float lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Fresnel), index);
+
+            // Enabled for difffuse lighting alpha component
+            if (lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                diffuse_sum.a() *= lut_value;
+            }
+
+            // Enabled for the specular lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                specular_sum.a() *= lut_value;
+            }
+        }
+
 
         auto diffuse = light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
         diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
-- 
cgit v1.2.3


From 80b6fc592e3a2f5821975e84b5df35f5dc4ae51a Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Fri, 9 Jun 2017 15:24:28 -0500
Subject: SwRasterizer: Fixed the lighting lut lookup function.

---
 src/video_core/swrasterizer/rasterizer.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 2d1daa24a..2b85ac86c 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -117,7 +117,9 @@ static std::tuple<float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v
 
 
 float LookupLightingLut(size_t lut_index, float index) {
-    unsigned index_i = static_cast<unsigned>(MathUtil::Clamp(floor(index * 256), 0.0f, 1.0f));
+    index *= 256;
+
+    unsigned index_i = static_cast<unsigned>(MathUtil::Clamp(floor(index), 0.0f, 255.0f));
 
     float index_f = index - index_i;
 
@@ -126,7 +128,7 @@ float LookupLightingLut(size_t lut_index, float index) {
     float lut_value = g_state.lighting.luts[lut_index][index_i].ToFloat();
     float lut_diff = g_state.lighting.luts[lut_index][index_i].DiffToFloat();
 
-    return lut_value + lut_diff * index_f;
+    return lut_value + lut_diff * index_f / 256.f;
 }
 
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
-- 
cgit v1.2.3


From f2d4d5c2191275bd91f2f42b880f3edf3bccfd63 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Fri, 9 Jun 2017 17:33:25 -0500
Subject: SwRasterizer: Corrected the light LUT lookups.

---
 src/video_core/swrasterizer/rasterizer.cpp | 76 +++++++++++++++++-------------
 1 file changed, 43 insertions(+), 33 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 2b85ac86c..a9098e1f0 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -115,20 +115,14 @@ static std::tuple<float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v
     return std::make_tuple(x / z * half + half, y / z * half + half, addr);
 }
 
-
-float LookupLightingLut(size_t lut_index, float index) {
-    index *= 256;
-
-    unsigned index_i = static_cast<unsigned>(MathUtil::Clamp(floor(index), 0.0f, 255.0f));
-
-    float index_f = index - index_i;
-
+float LookupLightingLut(size_t lut_index, u8 index, float delta) {
     ASSERT_MSG(lut_index < g_state.lighting.luts.size(), "Out of range lut");
+    ASSERT_MSG(index < g_state.lighting.luts[0].size(), "Out of range index");
 
-    float lut_value = g_state.lighting.luts[lut_index][index_i].ToFloat();
-    float lut_diff = g_state.lighting.luts[lut_index][index_i].DiffToFloat();
+    float lut_value = g_state.lighting.luts[lut_index][index].ToFloat();
+    float lut_diff = g_state.lighting.luts[lut_index][index].DiffToFloat();
 
-    return lut_value + lut_diff * index_f / 256.f;
+    return lut_value + lut_diff * delta;
 }
 
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
@@ -145,8 +139,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
         UNIMPLEMENTED();
     }
 
-    // TODO(Subv): Do we need to normalize the quaternion here?
-    auto normal = Math::QuaternionRotate(normquat, surface_normal);
+    // Use the normalized quaternion when performing the rotation
+    auto normal = Math::QuaternionRotate(normquat.Normalized(), surface_normal);
 
     Math::Vec3<float> light_vector = {};
     Math::Vec4<float> diffuse_sum = {0.f, 0.f, 0.f, 1.f};
@@ -182,7 +176,10 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
             size_t lut = static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
 
             float sample_loc = scale * distance + bias;
-            dist_atten = LookupLightingLut(lut, sample_loc);
+
+            u8 lutindex = MathUtil::Clamp(floorf(sample_loc * 256.f), 0.0f, 255.0f);
+            float delta = sample_loc * 256 - lutindex;
+            dist_atten = LookupLightingLut(lut, lutindex, delta / 256.f);
         }
 
         float clamp_highlights = 1.0f;
@@ -195,7 +192,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
         }
 
         auto GetLutIndex = [&](unsigned num, LightingRegs::LightingLutInput input,
-                              bool abs) -> float {
+                              bool abs) -> std::tuple<u8, float> {
 
             Math::Vec3<float> norm_view = view.Normalized();
             Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
@@ -229,14 +226,15 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
                     result = std::abs(result);
                 else
                     result = std::max(result, 0.0f);
-            } else {
-                if (result < 0.f)
-                    result += 2.f;
 
-                result /= 2.f;
+                u8 lutindex = MathUtil::Clamp(floorf(result * 256.f), 0.0f, 255.0f);
+                float delta = result * 256 - lutindex;
+                return { lutindex, delta / 256.f };
+            } else {
+                u8 tmpi = MathUtil::Clamp(floorf(result * 128.f), 0.0f, 127.0f);
+                float delta = result * 128.f - tmpi;
+                return { tmpi & 0xFF, delta / 128.f };
             }
-
-            return MathUtil::Clamp(result, 0.0f, 1.0f);
         };
 
         // Specular 0 component
@@ -246,11 +244,13 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
                 lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
 
             // Lookup specular "distribution 0" LUT value
-            float index = GetLutIndex(num, lighting.lut_input.d0.Value(), lighting.abs_lut_input.disable_d0 == 0);
+            u8 index;
+            float delta;
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.d0.Value(), lighting.abs_lut_input.disable_d0 == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.d0);
 
-            d0_lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution0), index);
+            d0_lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution0), index, delta);
         }
 
         Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
@@ -260,11 +260,13 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::ReflectRed)) {
 
-            float index = GetLutIndex(num, lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0);
+            u8 index;
+            float delta;
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rr);
 
-            refl_value.x = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed), index);
+            refl_value.x = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed), index, delta);
         } else {
             refl_value.x = 1.0f;
         }
@@ -274,11 +276,13 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::ReflectGreen)) {
 
-            float index = GetLutIndex(num, lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0);
+            u8 index;
+            float delta;
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rg);
 
-            refl_value.y = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen), index);
+            refl_value.y = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen), index, delta);
         } else {
             refl_value.y = refl_value.x;
         }
@@ -288,11 +292,13 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::ReflectBlue)) {
 
-            float index = GetLutIndex(num, lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0);
+            u8 index;
+            float delta;
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rb);
 
-            refl_value.z = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue), index);
+            refl_value.z = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue), index, delta);
         } else {
             refl_value.z = refl_value.x;
         }
@@ -303,11 +309,13 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
                 lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
 
             // Lookup specular "distribution 1" LUT value
-            float index = GetLutIndex(num, lighting.lut_input.d1.Value(), lighting.abs_lut_input.disable_d1 == 0);
+            u8 index;
+            float delta;
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.d1.Value(), lighting.abs_lut_input.disable_d1 == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.d1);
 
-            d1_lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution1), index);
+            d1_lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution1), index, delta);
         }
 
         Math::Vec3<float> specular_1 = d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
@@ -317,11 +325,13 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
                 lighting.config0.config, LightingRegs::LightingSampler::Fresnel)) {
 
             // Lookup fresnel LUT value
-            float index = GetLutIndex(num, lighting.lut_input.fr.Value(), lighting.abs_lut_input.disable_fr == 0);
+            u8 index;
+            float delta;
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.fr.Value(), lighting.abs_lut_input.disable_fr == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.fr);
 
-            float lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Fresnel), index);
+            float lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Fresnel), index, delta);
 
             // Enabled for difffuse lighting alpha component
             if (lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
-- 
cgit v1.2.3


From 2a75837bc30ba08e2470f4b91078747a08c5213a Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Fri, 9 Jun 2017 18:18:57 -0500
Subject: SwRasterizer: Corrected the light LUT lookups.

---
 src/video_core/swrasterizer/rasterizer.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index a9098e1f0..2c804b6e7 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -177,9 +177,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
 
             float sample_loc = scale * distance + bias;
 
-            u8 lutindex = MathUtil::Clamp(floorf(sample_loc * 256.f), 0.0f, 255.0f);
+            u8 lutindex = MathUtil::Clamp(std::floor(sample_loc * 256.f), 0.0f, 255.0f);
             float delta = sample_loc * 256 - lutindex;
-            dist_atten = LookupLightingLut(lut, lutindex, delta / 256.f);
+            dist_atten = LookupLightingLut(lut, lutindex, delta);
         }
 
         float clamp_highlights = 1.0f;
@@ -227,13 +227,14 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
                 else
                     result = std::max(result, 0.0f);
 
-                u8 lutindex = MathUtil::Clamp(floorf(result * 256.f), 0.0f, 255.0f);
+                u8 lutindex = MathUtil::Clamp(std::floor(result * 256.f), 0.0f, 255.0f);
                 float delta = result * 256 - lutindex;
-                return { lutindex, delta / 256.f };
+                return { lutindex, delta };
             } else {
-                u8 tmpi = MathUtil::Clamp(floorf(result * 128.f), 0.0f, 127.0f);
+                float flr = std::floor(result * 128.f);
+                s8 tmpi = MathUtil::Clamp(flr, -128.0f, 127.0f);
                 float delta = result * 128.f - tmpi;
-                return { tmpi & 0xFF, delta / 128.f };
+                return { tmpi & 0xFF, delta };
             }
         };
 
-- 
cgit v1.2.3


From 73566ff7a990cdfe8d8f023997b57942dc785fc4 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Sun, 11 Jun 2017 11:55:35 -0500
Subject: SwRasterizer: Flip the vertex quaternions before clipping (if
 necessary).

---
 src/video_core/swrasterizer/clipper.cpp    | 11 +++++++++++
 src/video_core/swrasterizer/rasterizer.cpp | 24 ++++--------------------
 2 files changed, 15 insertions(+), 20 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index 6fb923756..7537689b7 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -95,6 +95,17 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     static const size_t MAX_VERTICES = 9;
     static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
     static_vector<Vertex, MAX_VERTICES> buffer_b;
+
+    auto FlipQuaternionIfOpposite = [](auto& a, const auto& b) {
+        if (Math::Dot(a, b) < float24::Zero())
+            a = -a;
+    };
+
+    // Flip the quaternions if they are opposite to prevent interpolating them over the wrong
+    // direction.
+    FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat);
+    FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat);
+
     auto* output_list = &buffer_a;
     auto* input_list = &buffer_b;
 
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 2c804b6e7..76f793c86 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -362,13 +362,6 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
     };
 }
 
-static bool AreQuaternionsOpposite(Math::Vec4<Pica::float24> qa, Math::Vec4<Pica::float24> qb) {
-    Math::Vec4f a{ qa.x.ToFloat32(), qa.y.ToFloat32(), qa.z.ToFloat32(), qa.w.ToFloat32() };
-    Math::Vec4f b{ qb.x.ToFloat32(), qb.y.ToFloat32(), qb.z.ToFloat32(), qb.w.ToFloat32() };
-
-    return (Math::Dot(a, b) < 0.f);
-}
-
 MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240));
 
 /**
@@ -462,15 +455,6 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
     int bias2 =
         IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
 
-    // Flip the quaternions if they are opposite to prevent interpolating them over the wrong direction.
-    auto v1_quat = v1.quat;
-    auto v2_quat = v2.quat;
-
-    if (AreQuaternionsOpposite(v0.quat, v1.quat))
-        v1_quat = v1_quat * float24::FromFloat32(-1.0f);
-    if (AreQuaternionsOpposite(v0.quat, v2.quat))
-        v2_quat = v2_quat * float24::FromFloat32(-1.0f);
-
     auto w_inverse = Math::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);
 
     auto textures = regs.texturing.GetTextures();
@@ -571,11 +555,11 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
 
             Math::Quaternion<float> normquat{
                 {
-                    GetInterpolatedAttribute(v0.quat.x, v1_quat.x, v2_quat.x).ToFloat32(),
-                    GetInterpolatedAttribute(v0.quat.y, v1_quat.y, v2_quat.y).ToFloat32(),
-                    GetInterpolatedAttribute(v0.quat.z, v1_quat.z, v2_quat.z).ToFloat32()
+                    GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
+                    GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
+                    GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()
                 },
-                GetInterpolatedAttribute(v0.quat.w, v1_quat.w, v2_quat.w).ToFloat32(),
+                GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
             };
 
             Math::Vec3<float> fragment_position{
-- 
cgit v1.2.3


From 2d69a9b8bf232fdd9e3bbb2a9c624ee9dd6ec637 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Tue, 13 Jun 2017 12:31:28 -0500
Subject: SwRasterizer: Run clang-format

---
 src/video_core/swrasterizer/rasterizer.cpp | 128 +++++++++++++++++++----------
 1 file changed, 83 insertions(+), 45 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 76f793c86..382b5927b 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -125,11 +125,12 @@ float LookupLightingLut(size_t lut_index, u8 index, float delta) {
     return lut_value + lut_diff * delta;
 }
 
-std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
     const auto& lighting = g_state.regs.lighting;
 
     if (lighting.disable)
-        return {{}, {}};
+        return {Math::MakeVec<u8>(0, 0, 0, 0), Math::MakeVec<u8>(0, 0, 0, 0)};
 
     // TODO(Subv): Bump mapping
     Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
@@ -151,7 +152,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
         unsigned num = lighting.light_enable.GetNum(light_index);
         const auto& light_config = g_state.regs.lighting.light[num];
 
-        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(), float16::FromRaw(light_config.y).ToFloat32(), float16::FromRaw(light_config.z).ToFloat32()};
+        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
+                                      float16::FromRaw(light_config.y).ToFloat32(),
+                                      float16::FromRaw(light_config.z).ToFloat32()};
 
         if (light_config.config.directional)
             light_vector = position;
@@ -173,11 +176,13 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
             auto distance = (-view - position).Length();
             float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
             float bias = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
-            size_t lut = static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
+            size_t lut =
+                static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
 
             float sample_loc = scale * distance + bias;
 
-            u8 lutindex = MathUtil::Clamp(std::floor(sample_loc * 256.f), 0.0f, 255.0f);
+            u8 lutindex =
+                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.f), 0.0f, 255.0f));
             float delta = sample_loc * 256 - lutindex;
             dist_atten = LookupLightingLut(lut, lutindex, delta);
         }
@@ -192,7 +197,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
         }
 
         auto GetLutIndex = [&](unsigned num, LightingRegs::LightingLutInput input,
-                              bool abs) -> std::tuple<u8, float> {
+                               bool abs) -> std::tuple<u8, float> {
 
             Math::Vec3<float> norm_view = view.Normalized();
             Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
@@ -216,7 +221,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
                 break;
 
             default:
-                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %d\n", (int)input);
+                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
                 UNIMPLEMENTED();
                 result = 0.f;
             }
@@ -227,14 +232,15 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
                 else
                     result = std::max(result, 0.0f);
 
-                u8 lutindex = MathUtil::Clamp(std::floor(result * 256.f), 0.0f, 255.0f);
+                float flr = std::floor(result * 256.f);
+                u8 lutindex = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
                 float delta = result * 256 - lutindex;
-                return { lutindex, delta };
+                return {lutindex, delta};
             } else {
                 float flr = std::floor(result * 128.f);
-                s8 tmpi = MathUtil::Clamp(flr, -128.0f, 127.0f);
-                float delta = result * 128.f - tmpi;
-                return { tmpi & 0xFF, delta };
+                s8 lutindex = static_cast<u8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
+                float delta = result * 128.f - lutindex;
+                return {static_cast<u8>(lutindex), delta};
             }
         };
 
@@ -247,11 +253,15 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
             // Lookup specular "distribution 0" LUT value
             u8 index;
             float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.d0.Value(), lighting.abs_lut_input.disable_d0 == 0);
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.d0.Value(),
+                                                 lighting.abs_lut_input.disable_d0 == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.d0);
 
-            d0_lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution0), index, delta);
+            d0_lut_value =
+                scale *
+                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution0),
+                                  index, delta);
         }
 
         Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
@@ -263,11 +273,15 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
 
             u8 index;
             float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0);
+            std::tie(index, delta) =
+                GetLutIndex(num, lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rr);
 
-            refl_value.x = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed), index, delta);
+            refl_value.x =
+                scale *
+                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed),
+                                  index, delta);
         } else {
             refl_value.x = 1.0f;
         }
@@ -279,11 +293,15 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
 
             u8 index;
             float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0);
+            std::tie(index, delta) =
+                GetLutIndex(num, lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rg);
 
-            refl_value.y = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen), index, delta);
+            refl_value.y =
+                scale *
+                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen),
+                                  index, delta);
         } else {
             refl_value.y = refl_value.x;
         }
@@ -295,11 +313,15 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
 
             u8 index;
             float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0);
+            std::tie(index, delta) =
+                GetLutIndex(num, lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rb);
 
-            refl_value.z = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue), index, delta);
+            refl_value.z =
+                scale *
+                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue),
+                                  index, delta);
         } else {
             refl_value.z = refl_value.x;
         }
@@ -312,54 +334,72 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(const Math::Qu
             // Lookup specular "distribution 1" LUT value
             u8 index;
             float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.d1.Value(), lighting.abs_lut_input.disable_d1 == 0);
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.d1.Value(),
+                                                 lighting.abs_lut_input.disable_d1 == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.d1);
 
-            d1_lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution1), index, delta);
+            d1_lut_value =
+                scale *
+                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution1),
+                                  index, delta);
         }
 
-        Math::Vec3<float> specular_1 = d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
+        Math::Vec3<float> specular_1 =
+            d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
 
         if (lighting.config1.disable_lut_fr == 0 &&
-            LightingRegs::IsLightingSamplerSupported(
-                lighting.config0.config, LightingRegs::LightingSampler::Fresnel)) {
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::Fresnel)) {
 
             // Lookup fresnel LUT value
             u8 index;
             float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.fr.Value(), lighting.abs_lut_input.disable_fr == 0);
+            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.fr.Value(),
+                                                 lighting.abs_lut_input.disable_fr == 0);
 
             float scale = lighting.lut_scale.GetScale(lighting.lut_scale.fr);
 
-            float lut_value = scale * LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Fresnel), index, delta);
+            float lut_value =
+                scale *
+                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Fresnel),
+                                  index, delta);
 
-            // Enabled for difffuse lighting alpha component
-            if (lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
+            // Enabled for diffuse lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
                 lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
                 diffuse_sum.a() *= lut_value;
             }
 
             // Enabled for the specular lighting alpha component
             if (lighting.config0.fresnel_selector ==
-                LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
+                    LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
                 lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
                 specular_sum.a() *= lut_value;
             }
         }
 
-
-        auto diffuse = light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
+        auto diffuse =
+            light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
         diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
 
-        specular_sum += Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.f);
+        specular_sum +=
+            Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.f);
     }
 
     diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
-    return {
-        Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255, MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255).Cast<u8>(),
-        Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255, MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255).Cast<u8>()
-    };
+
+    return {Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
+                                 MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
+                                 MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
+                                 MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
+                .Cast<u8>(),
+            Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
+                                 MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
+                                 MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
+                                 MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
+                .Cast<u8>()};
 }
 
 MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240));
@@ -554,19 +594,16 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
             };
 
             Math::Quaternion<float> normquat{
-                {
-                    GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
-                    GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
-                    GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()
-                },
+                {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
+                 GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
+                 GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
                 GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
             };
 
             Math::Vec3<float> fragment_position{
                 GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
                 GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
-                GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32()
-            };
+                GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32()};
 
             Math::Vec2<float24> uv[3];
             uv[0].u() = GetInterpolatedAttribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
@@ -685,7 +722,8 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
             Math::Vec4<u8> primary_fragment_color;
             Math::Vec4<u8> secondary_fragment_color;
 
-            std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(normquat, fragment_position);
+            std::tie(primary_fragment_color, secondary_fragment_color) =
+                ComputeFragmentsColors(normquat, fragment_position);
 
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
                  ++tev_stage_index) {
-- 
cgit v1.2.3


From 6250f52e939c714ccb302003502ee78941c8221b Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Tue, 13 Jun 2017 12:36:45 -0500
Subject: SwRasterizer: Fixed a few conversion warnings and moved per-light
 values into the per-light loop.

---
 src/video_core/swrasterizer/rasterizer.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 382b5927b..54af53bbd 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -143,18 +143,18 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
     // Use the normalized the quaternion when performing the rotation
     auto normal = Math::QuaternionRotate(normquat.Normalized(), surface_normal);
 
-    Math::Vec3<float> light_vector = {};
     Math::Vec4<float> diffuse_sum = {0.f, 0.f, 0.f, 1.f};
     Math::Vec4<float> specular_sum = {0.f, 0.f, 0.f, 1.f};
-    Math::Vec3<float> refl_value = {};
 
     for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
         unsigned num = lighting.light_enable.GetNum(light_index);
         const auto& light_config = g_state.regs.lighting.light[num];
 
+        Math::Vec3<float> refl_value = {};
         Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
                                       float16::FromRaw(light_config.y).ToFloat32(),
                                       float16::FromRaw(light_config.z).ToFloat32()};
+        Math::Vec3<float> light_vector;
 
         if (light_config.config.directional)
             light_vector = position;
@@ -175,11 +175,12 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         if (!lighting.IsDistAttenDisabled(num)) {
             auto distance = (-view - position).Length();
             float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
-            float bias = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
+            float dist_aten_bias =
+                Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
             size_t lut =
                 static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
 
-            float sample_loc = scale * distance + bias;
+            float sample_loc = scale * distance + dist_aten_bias;
 
             u8 lutindex =
                 static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.f), 0.0f, 255.0f));
@@ -238,7 +239,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                 return {lutindex, delta};
             } else {
                 float flr = std::floor(result * 128.f);
-                s8 lutindex = static_cast<u8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
+                s8 lutindex = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
                 float delta = result * 128.f - lutindex;
                 return {static_cast<u8>(lutindex), delta};
             }
-- 
cgit v1.2.3


From 37ac2b6657002e19d78cbc97841f8d3eee6ac5b8 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Tue, 13 Jun 2017 12:53:50 -0500
Subject: SwRasterizer/Lighting: Fixed a bug where the distance attenuation
 bias was being set to the dist atten scale.

---
 src/video_core/swrasterizer/rasterizer.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 54af53bbd..48ed8ccbf 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -175,12 +175,11 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         if (!lighting.IsDistAttenDisabled(num)) {
             auto distance = (-view - position).Length();
             float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
-            float dist_aten_bias =
-                Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
+            float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
             size_t lut =
                 static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
 
-            float sample_loc = scale * distance + dist_aten_bias;
+            float sample_loc = scale * distance + bias;
 
             u8 lutindex =
                 static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.f), 0.0f, 255.0f));
-- 
cgit v1.2.3


From 7bc467e8725c6751eb44ea45ff2203af8692cda1 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Wed, 28 Jun 2017 12:34:16 -0500
Subject: SwRasterizer/Lighting: Do not use global state in LookupLightingLut.

---
 src/video_core/pica_state.h                |  2 +-
 src/video_core/swrasterizer/rasterizer.cpp | 33 +++++++++++++++++++-----------
 2 files changed, 22 insertions(+), 13 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index 2d23d34e6..864a2c9e6 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -79,7 +79,7 @@ struct State {
         std::array<ColorDifferenceEntry, 256> color_diff_table;
     } proctex;
 
-    struct {
+    struct Lighting {
         union LutEntry {
             // Used for raw access
             u32 raw;
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 48ed8ccbf..b69f7b692 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -115,12 +115,15 @@ static std::tuple<float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v
     return std::make_tuple(x / z * half + half, y / z * half + half, addr);
 }
 
-float LookupLightingLut(size_t lut_index, u8 index, float delta) {
-    ASSERT_MSG(lut_index < g_state.lighting.luts.size(), "Out of range lut");
-    ASSERT_MSG(index < g_state.lighting.luts[0].size(), "Out of range index");
+static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
+                        float delta) {
+    ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
+    ASSERT_MSG(index < lighting.luts[0].size(), "Out of range index");
 
-    float lut_value = g_state.lighting.luts[lut_index][index].ToFloat();
-    float lut_diff = g_state.lighting.luts[lut_index][index].DiffToFloat();
+    const auto& lut = lighting.luts[lut_index][index];
+
+    float lut_value = lut.ToFloat();
+    float lut_diff = lut.DiffToFloat();
 
     return lut_value + lut_diff * delta;
 }
@@ -184,7 +187,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             u8 lutindex =
                 static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.f), 0.0f, 255.0f));
             float delta = sample_loc * 256 - lutindex;
-            dist_atten = LookupLightingLut(lut, lutindex, delta);
+            dist_atten = LookupLightingLut(g_state.lighting, lut, lutindex, delta);
         }
 
         float clamp_highlights = 1.0f;
@@ -260,7 +263,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             d0_lut_value =
                 scale *
-                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution0),
+                LookupLightingLut(g_state.lighting,
+                                  static_cast<size_t>(LightingRegs::LightingSampler::Distribution0),
                                   index, delta);
         }
 
@@ -280,7 +284,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             refl_value.x =
                 scale *
-                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed),
+                LookupLightingLut(g_state.lighting,
+                                  static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed),
                                   index, delta);
         } else {
             refl_value.x = 1.0f;
@@ -300,7 +305,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             refl_value.y =
                 scale *
-                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen),
+                LookupLightingLut(g_state.lighting,
+                                  static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen),
                                   index, delta);
         } else {
             refl_value.y = refl_value.x;
@@ -320,7 +326,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             refl_value.z =
                 scale *
-                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue),
+                LookupLightingLut(g_state.lighting,
+                                  static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue),
                                   index, delta);
         } else {
             refl_value.z = refl_value.x;
@@ -341,7 +348,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             d1_lut_value =
                 scale *
-                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Distribution1),
+                LookupLightingLut(g_state.lighting,
+                                  static_cast<size_t>(LightingRegs::LightingSampler::Distribution1),
                                   index, delta);
         }
 
@@ -362,7 +370,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             float lut_value =
                 scale *
-                LookupLightingLut(static_cast<size_t>(LightingRegs::LightingSampler::Fresnel),
+                LookupLightingLut(g_state.lighting,
+                                  static_cast<size_t>(LightingRegs::LightingSampler::Fresnel),
                                   index, delta);
 
             // Enabled for diffuse lighting alpha component
-- 
cgit v1.2.3


From b8229a768434ab9b47123359669761c05ecdd6b0 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Wed, 28 Jun 2017 12:35:35 -0500
Subject: SwRasterizer/Lighting: Do not use global registers state in
 ComputeFragmentsColors.

---
 src/video_core/swrasterizer/rasterizer.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index b69f7b692..d2d77e8b0 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -129,8 +129,8 @@ static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut
 }
 
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
-    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
-    const auto& lighting = g_state.regs.lighting;
+    const Pica::LightingRegs& lighting, const Math::Quaternion<float>& normquat,
+    const Math::Vec3<float>& view) {
 
     if (lighting.disable)
         return {Math::MakeVec<u8>(0, 0, 0, 0), Math::MakeVec<u8>(0, 0, 0, 0)};
@@ -732,7 +732,7 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
             Math::Vec4<u8> secondary_fragment_color;
 
             std::tie(primary_fragment_color, secondary_fragment_color) =
-                ComputeFragmentsColors(normquat, fragment_position);
+                ComputeFragmentsColors(g_state.regs.lighting, normquat, fragment_position);
 
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
                  ++tev_stage_index) {
-- 
cgit v1.2.3


From 7526af5e52ac1e24512faa1cd8f1a169407689fb Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Wed, 28 Jun 2017 12:37:14 -0500
Subject: SwRasterizer/Lighting: Move the lighting enable check outside the
 ComputeFragmentsColors function.

---
 src/video_core/swrasterizer/rasterizer.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index d2d77e8b0..b2d2b6ef2 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -132,9 +132,6 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
     const Pica::LightingRegs& lighting, const Math::Quaternion<float>& normquat,
     const Math::Vec3<float>& view) {
 
-    if (lighting.disable)
-        return {Math::MakeVec<u8>(0, 0, 0, 0), Math::MakeVec<u8>(0, 0, 0, 0)};
-
     // TODO(Subv): Bump mapping
     Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
 
@@ -728,11 +725,13 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                 regs.texturing.tev_combiner_buffer_color.a,
             };
 
-            Math::Vec4<u8> primary_fragment_color;
-            Math::Vec4<u8> secondary_fragment_color;
+            Math::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
+            Math::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
 
-            std::tie(primary_fragment_color, secondary_fragment_color) =
-                ComputeFragmentsColors(g_state.regs.lighting, normquat, fragment_position);
+            if (!g_state.regs.lighting.disable) {
+                std::tie(primary_fragment_color, secondary_fragment_color) =
+                    ComputeFragmentsColors(g_state.regs.lighting, normquat, fragment_position);
+            }
 
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
                  ++tev_stage_index) {
-- 
cgit v1.2.3


From 9906feefbd37ebfd658fecc47e960f23adc6b190 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Wed, 28 Jun 2017 12:43:00 -0500
Subject: SwRasterizer/Lighting: Move the clamp highlight calculation to the
 end of the per-light loop body.

---
 src/video_core/swrasterizer/rasterizer.cpp | 34 +++++++++++++++---------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index b2d2b6ef2..2c7a1a815 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -163,14 +163,6 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
         light_vector.Normalize();
 
-        auto LV_N = Math::Dot(light_vector, normal);
-        auto dot_product = LV_N;
-
-        if (light_config.config.two_sided_diffuse)
-            dot_product = std::abs(dot_product);
-        else
-            dot_product = std::max(dot_product, 0.0f);
-
         float dist_atten = 1.0f;
         if (!lighting.IsDistAttenDisabled(num)) {
             auto distance = (-view - position).Length();
@@ -187,15 +179,6 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             dist_atten = LookupLightingLut(g_state.lighting, lut, lutindex, delta);
         }
 
-        float clamp_highlights = 1.0f;
-
-        if (lighting.config0.clamp_highlights) {
-            if (LV_N <= 0.f)
-                clamp_highlights = 0.f;
-            else
-                clamp_highlights = 1.f;
-        }
-
         auto GetLutIndex = [&](unsigned num, LightingRegs::LightingLutInput input,
                                bool abs) -> std::tuple<u8, float> {
 
@@ -386,6 +369,23 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             }
         }
 
+        auto dot_product = Math::Dot(light_vector, normal);
+
+        // Calculate clamp highlights before applying the two-sided diffuse configuration to the dot
+        // product.
+        float clamp_highlights = 1.0f;
+        if (lighting.config0.clamp_highlights) {
+            if (dot_product <= 0.f)
+                clamp_highlights = 0.f;
+            else
+                clamp_highlights = 1.f;
+        }
+
+        if (light_config.config.two_sided_diffuse)
+            dot_product = std::abs(dot_product);
+        else
+            dot_product = std::max(dot_product, 0.0f);
+
         auto diffuse =
             light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
         diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
-- 
cgit v1.2.3


From efc655aec00d43d53c41b55d9a94d17ce81e5942 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 11 Jul 2017 20:06:26 +0300
Subject: SwRasterizer/Lighting: pass lighting state as parameter

---
 src/video_core/swrasterizer/rasterizer.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 2c7a1a815..b108a0f86 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -116,7 +116,7 @@ static std::tuple<float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v
 }
 
 static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
-                        float delta) {
+                               float delta) {
     ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
     ASSERT_MSG(index < lighting.luts[0].size(), "Out of range index");
 
@@ -129,8 +129,8 @@ static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut
 }
 
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
-    const Pica::LightingRegs& lighting, const Math::Quaternion<float>& normquat,
-    const Math::Vec3<float>& view) {
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
 
     // TODO(Subv): Bump mapping
     Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
@@ -148,7 +148,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
     for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
         unsigned num = lighting.light_enable.GetNum(light_index);
-        const auto& light_config = g_state.regs.lighting.light[num];
+        const auto& light_config = lighting.light[num];
 
         Math::Vec3<float> refl_value = {};
         Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
@@ -176,7 +176,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             u8 lutindex =
                 static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.f), 0.0f, 255.0f));
             float delta = sample_loc * 256 - lutindex;
-            dist_atten = LookupLightingLut(g_state.lighting, lut, lutindex, delta);
+            dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
         }
 
         auto GetLutIndex = [&](unsigned num, LightingRegs::LightingLutInput input,
@@ -243,7 +243,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             d0_lut_value =
                 scale *
-                LookupLightingLut(g_state.lighting,
+                LookupLightingLut(lighting_state,
                                   static_cast<size_t>(LightingRegs::LightingSampler::Distribution0),
                                   index, delta);
         }
@@ -264,7 +264,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             refl_value.x =
                 scale *
-                LookupLightingLut(g_state.lighting,
+                LookupLightingLut(lighting_state,
                                   static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed),
                                   index, delta);
         } else {
@@ -285,7 +285,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             refl_value.y =
                 scale *
-                LookupLightingLut(g_state.lighting,
+                LookupLightingLut(lighting_state,
                                   static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen),
                                   index, delta);
         } else {
@@ -306,7 +306,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             refl_value.z =
                 scale *
-                LookupLightingLut(g_state.lighting,
+                LookupLightingLut(lighting_state,
                                   static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue),
                                   index, delta);
         } else {
@@ -328,7 +328,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             d1_lut_value =
                 scale *
-                LookupLightingLut(g_state.lighting,
+                LookupLightingLut(lighting_state,
                                   static_cast<size_t>(LightingRegs::LightingSampler::Distribution1),
                                   index, delta);
         }
@@ -350,7 +350,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
             float lut_value =
                 scale *
-                LookupLightingLut(g_state.lighting,
+                LookupLightingLut(lighting_state,
                                   static_cast<size_t>(LightingRegs::LightingSampler::Fresnel),
                                   index, delta);
 
@@ -729,8 +729,8 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
             Math::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
 
             if (!g_state.regs.lighting.disable) {
-                std::tie(primary_fragment_color, secondary_fragment_color) =
-                    ComputeFragmentsColors(g_state.regs.lighting, normquat, fragment_position);
+                std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
+                    g_state.regs.lighting, g_state.lighting, normquat, fragment_position);
             }
 
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
-- 
cgit v1.2.3


From f13cf506e0b0e42e6c9b00b163aaabc3b63fb7ea Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 11 Jul 2017 20:15:23 +0300
Subject: SwRasterizer: only interpolate quat and view when lighting is enabled

---
 src/video_core/swrasterizer/rasterizer.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index b108a0f86..5844c401c 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -599,18 +599,6 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                     255),
             };
 
-            Math::Quaternion<float> normquat{
-                {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
-                 GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
-                 GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
-                GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
-            };
-
-            Math::Vec3<float> fragment_position{
-                GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
-                GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
-                GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32()};
-
             Math::Vec2<float24> uv[3];
             uv[0].u() = GetInterpolatedAttribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
             uv[0].v() = GetInterpolatedAttribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
@@ -729,8 +717,20 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
             Math::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
 
             if (!g_state.regs.lighting.disable) {
-                std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
-                    g_state.regs.lighting, g_state.lighting, normquat, fragment_position);
+                Math::Quaternion<float> normquat{
+                    {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
+                     GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
+                     GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
+                    GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
+                };
+
+                Math::Vec3<float> view{
+                    GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
+                    GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
+                    GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
+                };
+                std::tie(primary_fragment_color, secondary_fragment_color) =
+                    ComputeFragmentsColors(g_state.regs.lighting, g_state.lighting, normquat, view);
             }
 
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
-- 
cgit v1.2.3


From c6d1472513394cc55b5d5a852d5f76b5e9a51f2b Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 11 Jul 2017 21:36:19 +0300
Subject: SwRasterizer/Lighting: refactor GetLutValue into a function.

Merges the repeated LUT lookup pattern into a single helper. Also makes the code more similar to the GL one.
---
 src/video_core/swrasterizer/rasterizer.cpp | 110 +++++++----------------------
 1 file changed, 27 insertions(+), 83 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 5844c401c..53c3bb585 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -179,9 +179,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
         }
 
-        auto GetLutIndex = [&](unsigned num, LightingRegs::LightingLutInput input,
-                               bool abs) -> std::tuple<u8, float> {
-
+        auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
+                               LightingRegs::LightingScale scale_enum,
+                               LightingRegs::LightingSampler sampler) {
             Math::Vec3<float> norm_view = view.Normalized();
             Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
             float result = 0.0f;
@@ -209,6 +209,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                 result = 0.f;
             }
 
+            u8 index;
+            float delta;
+
             if (abs) {
                 if (light_config.config.two_sided_diffuse)
                     result = std::abs(result);
@@ -216,15 +219,18 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                     result = std::max(result, 0.0f);
 
                 float flr = std::floor(result * 256.f);
-                u8 lutindex = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
-                float delta = result * 256 - lutindex;
-                return {lutindex, delta};
+                index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
+                delta = result * 256 - index;
             } else {
                 float flr = std::floor(result * 128.f);
-                s8 lutindex = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
-                float delta = result * 128.f - lutindex;
-                return {static_cast<u8>(lutindex), delta};
+                s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
+                delta = result * 128.f - signed_index;
+                index = static_cast<u8>(signed_index);
             }
+
+            float scale = lighting.lut_scale.GetScale(scale_enum);
+            return scale *
+                   LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
         };
 
         // Specular 0 component
@@ -232,20 +238,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         if (lighting.config1.disable_lut_d0 == 0 &&
             LightingRegs::IsLightingSamplerSupported(
                 lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
-
-            // Lookup specular "distribution 0" LUT value
-            u8 index;
-            float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.d0.Value(),
-                                                 lighting.abs_lut_input.disable_d0 == 0);
-
-            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.d0);
-
             d0_lut_value =
-                scale *
-                LookupLightingLut(lighting_state,
-                                  static_cast<size_t>(LightingRegs::LightingSampler::Distribution0),
-                                  index, delta);
+                GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0,
+                            lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0);
         }
 
         Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
@@ -254,19 +249,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         if (lighting.config1.disable_lut_rr == 0 &&
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::ReflectRed)) {
-
-            u8 index;
-            float delta;
-            std::tie(index, delta) =
-                GetLutIndex(num, lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0);
-
-            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rr);
-
             refl_value.x =
-                scale *
-                LookupLightingLut(lighting_state,
-                                  static_cast<size_t>(LightingRegs::LightingSampler::ReflectRed),
-                                  index, delta);
+                GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0,
+                            lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed);
         } else {
             refl_value.x = 1.0f;
         }
@@ -275,19 +260,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         if (lighting.config1.disable_lut_rg == 0 &&
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::ReflectGreen)) {
-
-            u8 index;
-            float delta;
-            std::tie(index, delta) =
-                GetLutIndex(num, lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0);
-
-            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rg);
-
             refl_value.y =
-                scale *
-                LookupLightingLut(lighting_state,
-                                  static_cast<size_t>(LightingRegs::LightingSampler::ReflectGreen),
-                                  index, delta);
+                GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0,
+                            lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen);
         } else {
             refl_value.y = refl_value.x;
         }
@@ -296,19 +271,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         if (lighting.config1.disable_lut_rb == 0 &&
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::ReflectBlue)) {
-
-            u8 index;
-            float delta;
-            std::tie(index, delta) =
-                GetLutIndex(num, lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0);
-
-            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.rb);
-
             refl_value.z =
-                scale *
-                LookupLightingLut(lighting_state,
-                                  static_cast<size_t>(LightingRegs::LightingSampler::ReflectBlue),
-                                  index, delta);
+                GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0,
+                            lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue);
         } else {
             refl_value.z = refl_value.x;
         }
@@ -317,20 +282,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         if (lighting.config1.disable_lut_d1 == 0 &&
             LightingRegs::IsLightingSamplerSupported(
                 lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
-
-            // Lookup specular "distribution 1" LUT value
-            u8 index;
-            float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.d1.Value(),
-                                                 lighting.abs_lut_input.disable_d1 == 0);
-
-            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.d1);
-
             d1_lut_value =
-                scale *
-                LookupLightingLut(lighting_state,
-                                  static_cast<size_t>(LightingRegs::LightingSampler::Distribution1),
-                                  index, delta);
+                GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0,
+                            lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1);
         }
 
         Math::Vec3<float> specular_1 =
@@ -340,19 +294,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::Fresnel)) {
 
-            // Lookup fresnel LUT value
-            u8 index;
-            float delta;
-            std::tie(index, delta) = GetLutIndex(num, lighting.lut_input.fr.Value(),
-                                                 lighting.abs_lut_input.disable_fr == 0);
-
-            float scale = lighting.lut_scale.GetScale(lighting.lut_scale.fr);
-
             float lut_value =
-                scale *
-                LookupLightingLut(lighting_state,
-                                  static_cast<size_t>(LightingRegs::LightingSampler::Fresnel),
-                                  index, delta);
+                GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0,
+                            lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel);
 
             // Enabled for diffuse lighting alpha component
             if (lighting.config0.fresnel_selector ==
-- 
cgit v1.2.3


From e415558a4fc471bc3ac2d22dd8052aeb63769c6e Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 11 Jul 2017 21:47:55 +0300
Subject: SwRasterizer/Lighting: get rid of nested return

---
 src/video_core/swrasterizer/rasterizer.cpp | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 53c3bb585..e46790f85 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -340,16 +340,17 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
     diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
 
-    return {Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
-                                 MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
-                                 MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
-                                 MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
-                .Cast<u8>(),
-            Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
-                                 MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
-                                 MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
-                                 MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
-                .Cast<u8>()};
+    auto diffuse = Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
+                       .Cast<u8>();
+    auto specular = Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
+                        .Cast<u8>();
+    return {diffuse, specular};
 }
 
 MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240));
-- 
cgit v1.2.3


From 56e5425e593e29aecf255c441791f2e24512f418 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 11 Jul 2017 22:07:19 +0300
Subject: SwRasterizer/Lighting: unify float suffix

---
 src/video_core/swrasterizer/rasterizer.cpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index e46790f85..c83680629 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -143,8 +143,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
     // Use the normalized the quaternion when performing the rotation
     auto normal = Math::QuaternionRotate(normquat.Normalized(), surface_normal);
 
-    Math::Vec4<float> diffuse_sum = {0.f, 0.f, 0.f, 1.f};
-    Math::Vec4<float> specular_sum = {0.f, 0.f, 0.f, 1.f};
+    Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+    Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
 
     for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
         unsigned num = lighting.light_enable.GetNum(light_index);
@@ -174,7 +174,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             float sample_loc = scale * distance + bias;
 
             u8 lutindex =
-                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.f), 0.0f, 255.0f));
+                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
             float delta = sample_loc * 256 - lutindex;
             dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
         }
@@ -206,7 +206,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             default:
                 LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
                 UNIMPLEMENTED();
-                result = 0.f;
+                result = 0.0f;
             }
 
             u8 index;
@@ -218,13 +218,13 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                 else
                     result = std::max(result, 0.0f);
 
-                float flr = std::floor(result * 256.f);
+                float flr = std::floor(result * 256.0f);
                 index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
                 delta = result * 256 - index;
             } else {
-                float flr = std::floor(result * 128.f);
+                float flr = std::floor(result * 128.0f);
                 s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
-                delta = result * 128.f - signed_index;
+                delta = result * 128.0f - signed_index;
                 index = static_cast<u8>(signed_index);
             }
 
@@ -278,6 +278,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             refl_value.z = refl_value.x;
         }
 
+        // Specular 1 component
         float d1_lut_value = 1.0f;
         if (lighting.config1.disable_lut_d1 == 0 &&
             LightingRegs::IsLightingSamplerSupported(
@@ -290,6 +291,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         Math::Vec3<float> specular_1 =
             d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
 
+        // Fresnel
         if (lighting.config1.disable_lut_fr == 0 &&
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::Fresnel)) {
@@ -319,10 +321,10 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         // product.
         float clamp_highlights = 1.0f;
         if (lighting.config0.clamp_highlights) {
-            if (dot_product <= 0.f)
-                clamp_highlights = 0.f;
+            if (dot_product <= 0.0f)
+                clamp_highlights = 0.0f;
             else
-                clamp_highlights = 1.f;
+                clamp_highlights = 1.0f;
         }
 
         if (light_config.config.two_sided_diffuse)
@@ -335,7 +337,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
 
         specular_sum +=
-            Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.f);
+            Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.0f);
     }
 
     diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
-- 
cgit v1.2.3


From 4feff63ffaec4d62d5bdfc85968cc99298907767 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 11 Jul 2017 22:19:00 +0300
Subject: SwRasterizer/Lighting: dist atten LUT input needs to be clamped

---
 src/video_core/swrasterizer/rasterizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index c83680629..37d1313cf 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -171,7 +171,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             size_t lut =
                 static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
 
-            float sample_loc = scale * distance + bias;
+            float sample_loc = MathUtil::Clamp(scale * distance + bias, 0.0f, 1.0f);
 
             u8 lutindex =
                 static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
-- 
cgit v1.2.3


From c89f804a01ef4c54de6051c3ce8c70d7e66812b0 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 27 Jul 2017 13:48:27 +0300
Subject: pica/shader_interpreter: fix off-by-one in LOOP

---
 src/video_core/shader/shader_interpreter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index aa1cec81f..206c0978a 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -631,7 +631,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                 state.address_registers[2] = loop_param.y;
 
                 Record<DebugDataRecord::LOOP_INT_IN>(debug_data, iteration, loop_param);
-                call(program_counter + 1, instr.flow_control.dest_offset - program_counter + 1,
+                call(program_counter + 1, instr.flow_control.dest_offset - program_counter,
                      instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z);
                 break;
             }
-- 
cgit v1.2.3


From c59ed47608367de8cd5e4e6d58da02dee30810a9 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Wed, 2 Aug 2017 22:05:53 +0300
Subject: SwRasterizer/Lighting: move quaternion normalization to the caller

---
 src/video_core/swrasterizer/rasterizer.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 37d1313cf..80ecf72ec 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -141,7 +141,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
     }
 
     // Use the normalized the quaternion when performing the rotation
-    auto normal = Math::QuaternionRotate(normquat.Normalized(), surface_normal);
+    auto normal = Math::QuaternionRotate(normquat, surface_normal);
 
     Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
     Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
@@ -664,12 +664,12 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
             Math::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
 
             if (!g_state.regs.lighting.disable) {
-                Math::Quaternion<float> normquat{
+                Math::Quaternion<float> normquat = Math::Quaternion<float>{
                     {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
                      GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
                      GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
                     GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
-                };
+                }.Normalized();
 
                 Math::Vec3<float> view{
                     GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
-- 
cgit v1.2.3


From 48b410587116c92339d936ed3b1fd00aba38d6b5 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Wed, 2 Aug 2017 22:07:15 +0300
Subject: SwRasterizer/Lighting: reduce confusion

---
 src/video_core/swrasterizer/rasterizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 80ecf72ec..aee630954 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -118,7 +118,7 @@ static std::tuple<float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v
 static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
                                float delta) {
     ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
-    ASSERT_MSG(index < lighting.luts[0].size(), "Out of range index");
+    ASSERT_MSG(index < lighting.luts[lut_index].size(), "Out of range index");
 
     const auto& lut = lighting.luts[lut_index][index];
 
-- 
cgit v1.2.3


From eda28266fb1f0eb96a2096cadb41b62db3dc2d2e Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Wed, 2 Aug 2017 22:20:40 +0300
Subject: SwRasterizer/Lighting: move to its own file

---
 src/video_core/CMakeLists.txt                     |   2 +
 src/video_core/swrasterizer/fragment_lighting.cpp | 250 ++++++++++++++++++++++
 src/video_core/swrasterizer/fragment_lighting.h   |  18 ++
 src/video_core/swrasterizer/rasterizer.cpp        | 241 +--------------------
 4 files changed, 271 insertions(+), 240 deletions(-)
 create mode 100644 src/video_core/swrasterizer/fragment_lighting.cpp
 create mode 100644 src/video_core/swrasterizer/fragment_lighting.h

(limited to 'src/video_core')

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 0961a3251..b2280f2ef 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -14,6 +14,7 @@ set(SRCS
             shader/shader.cpp
             shader/shader_interpreter.cpp
             swrasterizer/clipper.cpp
+            swrasterizer/fragment_lighting.cpp
             swrasterizer/framebuffer.cpp
             swrasterizer/proctex.cpp
             swrasterizer/rasterizer.cpp
@@ -54,6 +55,7 @@ set(HEADERS
             shader/shader.h
             shader/shader_interpreter.h
             swrasterizer/clipper.h
+            swrasterizer/fragment_lighting.h
             swrasterizer/framebuffer.h
             swrasterizer/proctex.h
             swrasterizer/rasterizer.h
diff --git a/src/video_core/swrasterizer/fragment_lighting.cpp b/src/video_core/swrasterizer/fragment_lighting.cpp
new file mode 100644
index 000000000..45a86b5cd
--- /dev/null
+++ b/src/video_core/swrasterizer/fragment_lighting.cpp
@@ -0,0 +1,250 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/math_util.h"
+#include "video_core/swrasterizer/fragment_lighting.h"
+
+namespace Pica {
+
+static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
+                               float delta) {
+    ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
+    ASSERT_MSG(index < lighting.luts[lut_index].size(), "Out of range index");
+
+    const auto& lut = lighting.luts[lut_index][index];
+
+    float lut_value = lut.ToFloat();
+    float lut_diff = lut.DiffToFloat();
+
+    return lut_value + lut_diff * delta;
+}
+
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
+
+    // TODO(Subv): Bump mapping
+    Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
+
+    if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
+        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping");
+        UNIMPLEMENTED();
+    }
+
+    // Use the normalized the quaternion when performing the rotation
+    auto normal = Math::QuaternionRotate(normquat, surface_normal);
+
+    Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+    Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+
+    for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
+        unsigned num = lighting.light_enable.GetNum(light_index);
+        const auto& light_config = lighting.light[num];
+
+        Math::Vec3<float> refl_value = {};
+        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
+                                      float16::FromRaw(light_config.y).ToFloat32(),
+                                      float16::FromRaw(light_config.z).ToFloat32()};
+        Math::Vec3<float> light_vector;
+
+        if (light_config.config.directional)
+            light_vector = position;
+        else
+            light_vector = position + view;
+
+        light_vector.Normalize();
+
+        float dist_atten = 1.0f;
+        if (!lighting.IsDistAttenDisabled(num)) {
+            auto distance = (-view - position).Length();
+            float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
+            float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
+            size_t lut =
+                static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
+
+            float sample_loc = MathUtil::Clamp(scale * distance + bias, 0.0f, 1.0f);
+
+            u8 lutindex =
+                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
+            float delta = sample_loc * 256 - lutindex;
+            dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
+        }
+
+        auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
+                               LightingRegs::LightingScale scale_enum,
+                               LightingRegs::LightingSampler sampler) {
+            Math::Vec3<float> norm_view = view.Normalized();
+            Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
+            float result = 0.0f;
+
+            switch (input) {
+            case LightingRegs::LightingLutInput::NH:
+                result = Math::Dot(normal, half_angle);
+                break;
+
+            case LightingRegs::LightingLutInput::VH:
+                result = Math::Dot(norm_view, half_angle);
+                break;
+
+            case LightingRegs::LightingLutInput::NV:
+                result = Math::Dot(normal, norm_view);
+                break;
+
+            case LightingRegs::LightingLutInput::LN:
+                result = Math::Dot(light_vector, normal);
+                break;
+
+            default:
+                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
+                UNIMPLEMENTED();
+                result = 0.0f;
+            }
+
+            u8 index;
+            float delta;
+
+            if (abs) {
+                if (light_config.config.two_sided_diffuse)
+                    result = std::abs(result);
+                else
+                    result = std::max(result, 0.0f);
+
+                float flr = std::floor(result * 256.0f);
+                index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
+                delta = result * 256 - index;
+            } else {
+                float flr = std::floor(result * 128.0f);
+                s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
+                delta = result * 128.0f - signed_index;
+                index = static_cast<u8>(signed_index);
+            }
+
+            float scale = lighting.lut_scale.GetScale(scale_enum);
+            return scale *
+                   LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
+        };
+
+        // Specular 0 component
+        float d0_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d0 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
+            d0_lut_value =
+                GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0,
+                            lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0);
+        }
+
+        Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
+
+        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
+        if (lighting.config1.disable_lut_rr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectRed)) {
+            refl_value.x =
+                GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0,
+                            lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed);
+        } else {
+            refl_value.x = 1.0f;
+        }
+
+        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rg == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectGreen)) {
+            refl_value.y =
+                GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0,
+                            lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen);
+        } else {
+            refl_value.y = refl_value.x;
+        }
+
+        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rb == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectBlue)) {
+            refl_value.z =
+                GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0,
+                            lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue);
+        } else {
+            refl_value.z = refl_value.x;
+        }
+
+        // Specular 1 component
+        float d1_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d1 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
+            d1_lut_value =
+                GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0,
+                            lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1);
+        }
+
+        Math::Vec3<float> specular_1 =
+            d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
+
+        // Fresnel
+        if (lighting.config1.disable_lut_fr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::Fresnel)) {
+
+            float lut_value =
+                GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0,
+                            lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel);
+
+            // Enabled for diffuse lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                diffuse_sum.a() *= lut_value;
+            }
+
+            // Enabled for the specular lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                specular_sum.a() *= lut_value;
+            }
+        }
+
+        auto dot_product = Math::Dot(light_vector, normal);
+
+        // Calculate clamp highlights before applying the two-sided diffuse configuration to the dot
+        // product.
+        float clamp_highlights = 1.0f;
+        if (lighting.config0.clamp_highlights) {
+            if (dot_product <= 0.0f)
+                clamp_highlights = 0.0f;
+            else
+                clamp_highlights = 1.0f;
+        }
+
+        if (light_config.config.two_sided_diffuse)
+            dot_product = std::abs(dot_product);
+        else
+            dot_product = std::max(dot_product, 0.0f);
+
+        auto diffuse =
+            light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
+        diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
+
+        specular_sum +=
+            Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.0f);
+    }
+
+    diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
+
+    auto diffuse = Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
+                       .Cast<u8>();
+    auto specular = Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
+                        .Cast<u8>();
+    return {diffuse, specular};
+}
+
+} // namespace Pica
diff --git a/src/video_core/swrasterizer/fragment_lighting.h b/src/video_core/swrasterizer/fragment_lighting.h
new file mode 100644
index 000000000..438dca926
--- /dev/null
+++ b/src/video_core/swrasterizer/fragment_lighting.h
@@ -0,0 +1,18 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <tuple>
+#include "common/quaternion.h"
+#include "common/vector_math.h"
+#include "video_core/pica_state.h"
+
+namespace Pica {
+
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view);
+
+} // namespace Pica
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index aee630954..bc7e1c56c 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -24,6 +24,7 @@
 #include "video_core/regs_rasterizer.h"
 #include "video_core/regs_texturing.h"
 #include "video_core/shader/shader.h"
+#include "video_core/swrasterizer/fragment_lighting.h"
 #include "video_core/swrasterizer/framebuffer.h"
 #include "video_core/swrasterizer/proctex.h"
 #include "video_core/swrasterizer/rasterizer.h"
@@ -115,246 +116,6 @@ static std::tuple<float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v
     return std::make_tuple(x / z * half + half, y / z * half + half, addr);
 }
 
-static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
-                               float delta) {
-    ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
-    ASSERT_MSG(index < lighting.luts[lut_index].size(), "Out of range index");
-
-    const auto& lut = lighting.luts[lut_index][index];
-
-    float lut_value = lut.ToFloat();
-    float lut_diff = lut.DiffToFloat();
-
-    return lut_value + lut_diff * delta;
-}
-
-std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
-    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
-    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
-
-    // TODO(Subv): Bump mapping
-    Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
-
-    if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
-        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping");
-        UNIMPLEMENTED();
-    }
-
-    // Use the normalized the quaternion when performing the rotation
-    auto normal = Math::QuaternionRotate(normquat, surface_normal);
-
-    Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
-    Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
-
-    for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
-        unsigned num = lighting.light_enable.GetNum(light_index);
-        const auto& light_config = lighting.light[num];
-
-        Math::Vec3<float> refl_value = {};
-        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
-                                      float16::FromRaw(light_config.y).ToFloat32(),
-                                      float16::FromRaw(light_config.z).ToFloat32()};
-        Math::Vec3<float> light_vector;
-
-        if (light_config.config.directional)
-            light_vector = position;
-        else
-            light_vector = position + view;
-
-        light_vector.Normalize();
-
-        float dist_atten = 1.0f;
-        if (!lighting.IsDistAttenDisabled(num)) {
-            auto distance = (-view - position).Length();
-            float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
-            float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
-            size_t lut =
-                static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
-
-            float sample_loc = MathUtil::Clamp(scale * distance + bias, 0.0f, 1.0f);
-
-            u8 lutindex =
-                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
-            float delta = sample_loc * 256 - lutindex;
-            dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
-        }
-
-        auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
-                               LightingRegs::LightingScale scale_enum,
-                               LightingRegs::LightingSampler sampler) {
-            Math::Vec3<float> norm_view = view.Normalized();
-            Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
-            float result = 0.0f;
-
-            switch (input) {
-            case LightingRegs::LightingLutInput::NH:
-                result = Math::Dot(normal, half_angle);
-                break;
-
-            case LightingRegs::LightingLutInput::VH:
-                result = Math::Dot(norm_view, half_angle);
-                break;
-
-            case LightingRegs::LightingLutInput::NV:
-                result = Math::Dot(normal, norm_view);
-                break;
-
-            case LightingRegs::LightingLutInput::LN:
-                result = Math::Dot(light_vector, normal);
-                break;
-
-            default:
-                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
-                UNIMPLEMENTED();
-                result = 0.0f;
-            }
-
-            u8 index;
-            float delta;
-
-            if (abs) {
-                if (light_config.config.two_sided_diffuse)
-                    result = std::abs(result);
-                else
-                    result = std::max(result, 0.0f);
-
-                float flr = std::floor(result * 256.0f);
-                index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
-                delta = result * 256 - index;
-            } else {
-                float flr = std::floor(result * 128.0f);
-                s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
-                delta = result * 128.0f - signed_index;
-                index = static_cast<u8>(signed_index);
-            }
-
-            float scale = lighting.lut_scale.GetScale(scale_enum);
-            return scale *
-                   LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
-        };
-
-        // Specular 0 component
-        float d0_lut_value = 1.0f;
-        if (lighting.config1.disable_lut_d0 == 0 &&
-            LightingRegs::IsLightingSamplerSupported(
-                lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
-            d0_lut_value =
-                GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0,
-                            lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0);
-        }
-
-        Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
-
-        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
-        if (lighting.config1.disable_lut_rr == 0 &&
-            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
-                                                     LightingRegs::LightingSampler::ReflectRed)) {
-            refl_value.x =
-                GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0,
-                            lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed);
-        } else {
-            refl_value.x = 1.0f;
-        }
-
-        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
-        if (lighting.config1.disable_lut_rg == 0 &&
-            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
-                                                     LightingRegs::LightingSampler::ReflectGreen)) {
-            refl_value.y =
-                GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0,
-                            lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen);
-        } else {
-            refl_value.y = refl_value.x;
-        }
-
-        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
-        if (lighting.config1.disable_lut_rb == 0 &&
-            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
-                                                     LightingRegs::LightingSampler::ReflectBlue)) {
-            refl_value.z =
-                GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0,
-                            lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue);
-        } else {
-            refl_value.z = refl_value.x;
-        }
-
-        // Specular 1 component
-        float d1_lut_value = 1.0f;
-        if (lighting.config1.disable_lut_d1 == 0 &&
-            LightingRegs::IsLightingSamplerSupported(
-                lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
-            d1_lut_value =
-                GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0,
-                            lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1);
-        }
-
-        Math::Vec3<float> specular_1 =
-            d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
-
-        // Fresnel
-        if (lighting.config1.disable_lut_fr == 0 &&
-            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
-                                                     LightingRegs::LightingSampler::Fresnel)) {
-
-            float lut_value =
-                GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0,
-                            lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel);
-
-            // Enabled for diffuse lighting alpha component
-            if (lighting.config0.fresnel_selector ==
-                    LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
-                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                diffuse_sum.a() *= lut_value;
-            }
-
-            // Enabled for the specular lighting alpha component
-            if (lighting.config0.fresnel_selector ==
-                    LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
-                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                specular_sum.a() *= lut_value;
-            }
-        }
-
-        auto dot_product = Math::Dot(light_vector, normal);
-
-        // Calculate clamp highlights before applying the two-sided diffuse configuration to the dot
-        // product.
-        float clamp_highlights = 1.0f;
-        if (lighting.config0.clamp_highlights) {
-            if (dot_product <= 0.0f)
-                clamp_highlights = 0.0f;
-            else
-                clamp_highlights = 1.0f;
-        }
-
-        if (light_config.config.two_sided_diffuse)
-            dot_product = std::abs(dot_product);
-        else
-            dot_product = std::max(dot_product, 0.0f);
-
-        auto diffuse =
-            light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
-        diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
-
-        specular_sum +=
-            Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.0f);
-    }
-
-    diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
-
-    auto diffuse = Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
-                                        MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
-                                        MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
-                                        MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
-                       .Cast<u8>();
-    auto specular = Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
-                                         MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
-                                         MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
-                                         MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
-                        .Cast<u8>();
-    return {diffuse, specular};
-}
-
 MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240));
 
 /**
-- 
cgit v1.2.3


From 2252a63f8036cdf2612243271ce29e6104f82825 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 3 Aug 2017 12:01:31 +0300
Subject: SwRasterizer/Lighting: shorten file name

---
 src/video_core/CMakeLists.txt                     |   4 +-
 src/video_core/swrasterizer/fragment_lighting.cpp | 250 ----------------------
 src/video_core/swrasterizer/fragment_lighting.h   |  18 --
 src/video_core/swrasterizer/lighting.cpp          | 250 ++++++++++++++++++++++
 src/video_core/swrasterizer/lighting.h            |  18 ++
 src/video_core/swrasterizer/rasterizer.cpp        |   2 +-
 6 files changed, 271 insertions(+), 271 deletions(-)
 delete mode 100644 src/video_core/swrasterizer/fragment_lighting.cpp
 delete mode 100644 src/video_core/swrasterizer/fragment_lighting.h
 create mode 100644 src/video_core/swrasterizer/lighting.cpp
 create mode 100644 src/video_core/swrasterizer/lighting.h

(limited to 'src/video_core')

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index b2280f2ef..cffa4c952 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -14,8 +14,8 @@ set(SRCS
             shader/shader.cpp
             shader/shader_interpreter.cpp
             swrasterizer/clipper.cpp
-            swrasterizer/fragment_lighting.cpp
             swrasterizer/framebuffer.cpp
+            swrasterizer/lighting.cpp
             swrasterizer/proctex.cpp
             swrasterizer/rasterizer.cpp
             swrasterizer/swrasterizer.cpp
@@ -55,8 +55,8 @@ set(HEADERS
             shader/shader.h
             shader/shader_interpreter.h
             swrasterizer/clipper.h
-            swrasterizer/fragment_lighting.h
             swrasterizer/framebuffer.h
+            swrasterizer/lighting.h
             swrasterizer/proctex.h
             swrasterizer/rasterizer.h
             swrasterizer/swrasterizer.h
diff --git a/src/video_core/swrasterizer/fragment_lighting.cpp b/src/video_core/swrasterizer/fragment_lighting.cpp
deleted file mode 100644
index 45a86b5cd..000000000
--- a/src/video_core/swrasterizer/fragment_lighting.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-// Copyright 2017 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include "common/math_util.h"
-#include "video_core/swrasterizer/fragment_lighting.h"
-
-namespace Pica {
-
-static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
-                               float delta) {
-    ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
-    ASSERT_MSG(index < lighting.luts[lut_index].size(), "Out of range index");
-
-    const auto& lut = lighting.luts[lut_index][index];
-
-    float lut_value = lut.ToFloat();
-    float lut_diff = lut.DiffToFloat();
-
-    return lut_value + lut_diff * delta;
-}
-
-std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
-    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
-    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
-
-    // TODO(Subv): Bump mapping
-    Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
-
-    if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
-        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping");
-        UNIMPLEMENTED();
-    }
-
-    // Use the normalized the quaternion when performing the rotation
-    auto normal = Math::QuaternionRotate(normquat, surface_normal);
-
-    Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
-    Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
-
-    for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
-        unsigned num = lighting.light_enable.GetNum(light_index);
-        const auto& light_config = lighting.light[num];
-
-        Math::Vec3<float> refl_value = {};
-        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
-                                      float16::FromRaw(light_config.y).ToFloat32(),
-                                      float16::FromRaw(light_config.z).ToFloat32()};
-        Math::Vec3<float> light_vector;
-
-        if (light_config.config.directional)
-            light_vector = position;
-        else
-            light_vector = position + view;
-
-        light_vector.Normalize();
-
-        float dist_atten = 1.0f;
-        if (!lighting.IsDistAttenDisabled(num)) {
-            auto distance = (-view - position).Length();
-            float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
-            float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
-            size_t lut =
-                static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
-
-            float sample_loc = MathUtil::Clamp(scale * distance + bias, 0.0f, 1.0f);
-
-            u8 lutindex =
-                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
-            float delta = sample_loc * 256 - lutindex;
-            dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
-        }
-
-        auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
-                               LightingRegs::LightingScale scale_enum,
-                               LightingRegs::LightingSampler sampler) {
-            Math::Vec3<float> norm_view = view.Normalized();
-            Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
-            float result = 0.0f;
-
-            switch (input) {
-            case LightingRegs::LightingLutInput::NH:
-                result = Math::Dot(normal, half_angle);
-                break;
-
-            case LightingRegs::LightingLutInput::VH:
-                result = Math::Dot(norm_view, half_angle);
-                break;
-
-            case LightingRegs::LightingLutInput::NV:
-                result = Math::Dot(normal, norm_view);
-                break;
-
-            case LightingRegs::LightingLutInput::LN:
-                result = Math::Dot(light_vector, normal);
-                break;
-
-            default:
-                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
-                UNIMPLEMENTED();
-                result = 0.0f;
-            }
-
-            u8 index;
-            float delta;
-
-            if (abs) {
-                if (light_config.config.two_sided_diffuse)
-                    result = std::abs(result);
-                else
-                    result = std::max(result, 0.0f);
-
-                float flr = std::floor(result * 256.0f);
-                index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
-                delta = result * 256 - index;
-            } else {
-                float flr = std::floor(result * 128.0f);
-                s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
-                delta = result * 128.0f - signed_index;
-                index = static_cast<u8>(signed_index);
-            }
-
-            float scale = lighting.lut_scale.GetScale(scale_enum);
-            return scale *
-                   LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
-        };
-
-        // Specular 0 component
-        float d0_lut_value = 1.0f;
-        if (lighting.config1.disable_lut_d0 == 0 &&
-            LightingRegs::IsLightingSamplerSupported(
-                lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
-            d0_lut_value =
-                GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0,
-                            lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0);
-        }
-
-        Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
-
-        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
-        if (lighting.config1.disable_lut_rr == 0 &&
-            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
-                                                     LightingRegs::LightingSampler::ReflectRed)) {
-            refl_value.x =
-                GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0,
-                            lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed);
-        } else {
-            refl_value.x = 1.0f;
-        }
-
-        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
-        if (lighting.config1.disable_lut_rg == 0 &&
-            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
-                                                     LightingRegs::LightingSampler::ReflectGreen)) {
-            refl_value.y =
-                GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0,
-                            lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen);
-        } else {
-            refl_value.y = refl_value.x;
-        }
-
-        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
-        if (lighting.config1.disable_lut_rb == 0 &&
-            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
-                                                     LightingRegs::LightingSampler::ReflectBlue)) {
-            refl_value.z =
-                GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0,
-                            lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue);
-        } else {
-            refl_value.z = refl_value.x;
-        }
-
-        // Specular 1 component
-        float d1_lut_value = 1.0f;
-        if (lighting.config1.disable_lut_d1 == 0 &&
-            LightingRegs::IsLightingSamplerSupported(
-                lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
-            d1_lut_value =
-                GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0,
-                            lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1);
-        }
-
-        Math::Vec3<float> specular_1 =
-            d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
-
-        // Fresnel
-        if (lighting.config1.disable_lut_fr == 0 &&
-            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
-                                                     LightingRegs::LightingSampler::Fresnel)) {
-
-            float lut_value =
-                GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0,
-                            lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel);
-
-            // Enabled for diffuse lighting alpha component
-            if (lighting.config0.fresnel_selector ==
-                    LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
-                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                diffuse_sum.a() *= lut_value;
-            }
-
-            // Enabled for the specular lighting alpha component
-            if (lighting.config0.fresnel_selector ==
-                    LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
-                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                specular_sum.a() *= lut_value;
-            }
-        }
-
-        auto dot_product = Math::Dot(light_vector, normal);
-
-        // Calculate clamp highlights before applying the two-sided diffuse configuration to the dot
-        // product.
-        float clamp_highlights = 1.0f;
-        if (lighting.config0.clamp_highlights) {
-            if (dot_product <= 0.0f)
-                clamp_highlights = 0.0f;
-            else
-                clamp_highlights = 1.0f;
-        }
-
-        if (light_config.config.two_sided_diffuse)
-            dot_product = std::abs(dot_product);
-        else
-            dot_product = std::max(dot_product, 0.0f);
-
-        auto diffuse =
-            light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
-        diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
-
-        specular_sum +=
-            Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.0f);
-    }
-
-    diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
-
-    auto diffuse = Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
-                                        MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
-                                        MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
-                                        MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
-                       .Cast<u8>();
-    auto specular = Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
-                                         MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
-                                         MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
-                                         MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
-                        .Cast<u8>();
-    return {diffuse, specular};
-}
-
-} // namespace Pica
diff --git a/src/video_core/swrasterizer/fragment_lighting.h b/src/video_core/swrasterizer/fragment_lighting.h
deleted file mode 100644
index 438dca926..000000000
--- a/src/video_core/swrasterizer/fragment_lighting.h
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2017 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <tuple>
-#include "common/quaternion.h"
-#include "common/vector_math.h"
-#include "video_core/pica_state.h"
-
-namespace Pica {
-
-std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
-    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
-    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view);
-
-} // namespace Pica
diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
new file mode 100644
index 000000000..63088eee8
--- /dev/null
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -0,0 +1,250 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/math_util.h"
+#include "video_core/swrasterizer/lighting.h"
+
+namespace Pica {
+
+static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
+                               float delta) {
+    ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
+    ASSERT_MSG(index < lighting.luts[lut_index].size(), "Out of range index");
+
+    const auto& lut = lighting.luts[lut_index][index];
+
+    float lut_value = lut.ToFloat();
+    float lut_diff = lut.DiffToFloat();
+
+    return lut_value + lut_diff * delta;
+}
+
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
+
+    // TODO(Subv): Bump mapping
+    Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
+
+    if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
+        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping");
+        UNIMPLEMENTED();
+    }
+
+    // Use the normalized quaternion when performing the rotation
+    auto normal = Math::QuaternionRotate(normquat, surface_normal);
+
+    Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+    Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+
+    for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
+        unsigned num = lighting.light_enable.GetNum(light_index);
+        const auto& light_config = lighting.light[num];
+
+        Math::Vec3<float> refl_value = {};
+        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
+                                      float16::FromRaw(light_config.y).ToFloat32(),
+                                      float16::FromRaw(light_config.z).ToFloat32()};
+        Math::Vec3<float> light_vector;
+
+        if (light_config.config.directional)
+            light_vector = position;
+        else
+            light_vector = position + view;
+
+        light_vector.Normalize();
+
+        float dist_atten = 1.0f;
+        if (!lighting.IsDistAttenDisabled(num)) {
+            auto distance = (-view - position).Length();
+            float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
+            float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
+            size_t lut =
+                static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
+
+            float sample_loc = MathUtil::Clamp(scale * distance + bias, 0.0f, 1.0f);
+
+            u8 lutindex =
+                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
+            float delta = sample_loc * 256 - lutindex;
+            dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
+        }
+
+        auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
+                               LightingRegs::LightingScale scale_enum,
+                               LightingRegs::LightingSampler sampler) {
+            Math::Vec3<float> norm_view = view.Normalized();
+            Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
+            float result = 0.0f;
+
+            switch (input) {
+            case LightingRegs::LightingLutInput::NH:
+                result = Math::Dot(normal, half_angle);
+                break;
+
+            case LightingRegs::LightingLutInput::VH:
+                result = Math::Dot(norm_view, half_angle);
+                break;
+
+            case LightingRegs::LightingLutInput::NV:
+                result = Math::Dot(normal, norm_view);
+                break;
+
+            case LightingRegs::LightingLutInput::LN:
+                result = Math::Dot(light_vector, normal);
+                break;
+
+            default:
+                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
+                UNIMPLEMENTED();
+                result = 0.0f;
+            }
+
+            u8 index;
+            float delta;
+
+            if (abs) {
+                if (light_config.config.two_sided_diffuse)
+                    result = std::abs(result);
+                else
+                    result = std::max(result, 0.0f);
+
+                float flr = std::floor(result * 256.0f);
+                index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
+                delta = result * 256 - index;
+            } else {
+                float flr = std::floor(result * 128.0f);
+                s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
+                delta = result * 128.0f - signed_index;
+                index = static_cast<u8>(signed_index);
+            }
+
+            float scale = lighting.lut_scale.GetScale(scale_enum);
+            return scale *
+                   LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
+        };
+
+        // Specular 0 component
+        float d0_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d0 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
+            d0_lut_value =
+                GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0,
+                            lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0);
+        }
+
+        Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
+
+        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
+        if (lighting.config1.disable_lut_rr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectRed)) {
+            refl_value.x =
+                GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0,
+                            lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed);
+        } else {
+            refl_value.x = 1.0f;
+        }
+
+        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rg == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectGreen)) {
+            refl_value.y =
+                GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0,
+                            lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen);
+        } else {
+            refl_value.y = refl_value.x;
+        }
+
+        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rb == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectBlue)) {
+            refl_value.z =
+                GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0,
+                            lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue);
+        } else {
+            refl_value.z = refl_value.x;
+        }
+
+        // Specular 1 component
+        float d1_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d1 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
+            d1_lut_value =
+                GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0,
+                            lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1);
+        }
+
+        Math::Vec3<float> specular_1 =
+            d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
+
+        // Fresnel
+        if (lighting.config1.disable_lut_fr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::Fresnel)) {
+
+            float lut_value =
+                GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0,
+                            lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel);
+
+            // Enabled for diffuse lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                diffuse_sum.a() *= lut_value;
+            }
+
+            // Enabled for the specular lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                specular_sum.a() *= lut_value;
+            }
+        }
+
+        auto dot_product = Math::Dot(light_vector, normal);
+
+        // Calculate clamp highlights before applying the two-sided diffuse configuration to the dot
+        // product.
+        float clamp_highlights = 1.0f;
+        if (lighting.config0.clamp_highlights) {
+            if (dot_product <= 0.0f)
+                clamp_highlights = 0.0f;
+            else
+                clamp_highlights = 1.0f;
+        }
+
+        if (light_config.config.two_sided_diffuse)
+            dot_product = std::abs(dot_product);
+        else
+            dot_product = std::max(dot_product, 0.0f);
+
+        auto diffuse =
+            light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
+        diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
+
+        specular_sum +=
+            Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.0f);
+    }
+
+    diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
+
+    auto diffuse = Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
+                       .Cast<u8>();
+    auto specular = Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
+                        .Cast<u8>();
+    return {diffuse, specular};
+}
+
+} // namespace Pica
diff --git a/src/video_core/swrasterizer/lighting.h b/src/video_core/swrasterizer/lighting.h
new file mode 100644
index 000000000..438dca926
--- /dev/null
+++ b/src/video_core/swrasterizer/lighting.h
@@ -0,0 +1,18 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <tuple>
+#include "common/quaternion.h"
+#include "common/vector_math.h"
+#include "video_core/pica_state.h"
+
+namespace Pica {
+
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view);
+
+} // namespace Pica
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index bc7e1c56c..fdc1df199 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -24,8 +24,8 @@
 #include "video_core/regs_rasterizer.h"
 #include "video_core/regs_texturing.h"
 #include "video_core/shader/shader.h"
-#include "video_core/swrasterizer/fragment_lighting.h"
 #include "video_core/swrasterizer/framebuffer.h"
+#include "video_core/swrasterizer/lighting.h"
 #include "video_core/swrasterizer/proctex.h"
 #include "video_core/swrasterizer/rasterizer.h"
 #include "video_core/swrasterizer/texturing.h"
-- 
cgit v1.2.3


From baa24f4ea9d9c4d7c1bd60ba8a6fc188dfa9cc8f Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 3 Aug 2017 01:40:42 +0300
Subject: pica: upload shared shader code to both unit

---
 src/video_core/command_processor.cpp | 62 +++++++++++++++++++++---------------
 src/video_core/regs_pipeline.h       |  9 +++++-
 2 files changed, 45 insertions(+), 26 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 4633a1df1..f98ca3302 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -119,27 +119,6 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup,
     }
 }
 
-static void WriteProgramCode(ShaderRegs& config, Shader::ShaderSetup& setup,
-                             unsigned max_program_code_length, u32 value) {
-    if (config.program.offset >= max_program_code_length) {
-        LOG_ERROR(HW_GPU, "Invalid %s program offset %d", GetShaderSetupTypeName(setup),
-                  (int)config.program.offset);
-    } else {
-        setup.program_code[config.program.offset] = value;
-        config.program.offset++;
-    }
-}
-
-static void WriteSwizzlePatterns(ShaderRegs& config, Shader::ShaderSetup& setup, u32 value) {
-    if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) {
-        LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", GetShaderSetupTypeName(setup),
-                  (int)config.swizzle_patterns.offset);
-    } else {
-        setup.swizzle_data[config.swizzle_patterns.offset] = value;
-        config.swizzle_patterns.offset++;
-    }
-}
-
 static void WritePicaReg(u32 id, u32 value, u32 mask) {
     auto& regs = g_state.regs;
 
@@ -458,7 +437,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1):
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2):
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): {
-        WriteProgramCode(g_state.regs.gs, g_state.gs, 4096, value);
+        u32& offset = g_state.regs.gs.program.offset;
+        if (offset >= 4096) {
+            LOG_ERROR(HW_GPU, "Invalid GS program offset %u", offset);
+        } else {
+            g_state.gs.program_code[offset] = value;
+            offset++;
+        }
         break;
     }
 
@@ -470,11 +455,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab):
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac):
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): {
-        WriteSwizzlePatterns(g_state.regs.gs, g_state.gs, value);
+        u32& offset = g_state.regs.gs.swizzle_patterns.offset;
+        if (offset >= g_state.gs.swizzle_data.size()) {
+            LOG_ERROR(HW_GPU, "Invalid GS swizzle pattern offset %u", offset);
+        } else {
+            g_state.gs.swizzle_data[offset] = value;
+            offset++;
+        }
         break;
     }
 
     case PICA_REG_INDEX(vs.bool_uniforms):
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         WriteUniformBoolReg(g_state.vs, g_state.regs.vs.bool_uniforms.Value());
         break;
 
@@ -482,6 +474,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2):
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3):
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): {
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1));
         auto values = regs.vs.int_uniforms[index];
         WriteUniformIntReg(g_state.vs, index,
@@ -497,6 +490,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6):
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7):
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): {
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         WriteUniformFloatReg(g_state.regs.vs, g_state.vs, vs_float_regs_counter,
                              vs_uniform_write_buffer, value);
         break;
@@ -510,7 +504,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1):
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2):
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): {
-        WriteProgramCode(g_state.regs.vs, g_state.vs, 512, value);
+        u32& offset = g_state.regs.vs.program.offset;
+        if (offset >= 512) {
+            LOG_ERROR(HW_GPU, "Invalid VS program offset %u", offset);
+        } else {
+            g_state.vs.program_code[offset] = value;
+            if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) {
+                g_state.gs.program_code[offset] = value;
+            }
+            offset++;
+        }
         break;
     }
 
@@ -522,7 +525,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db):
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc):
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): {
-        WriteSwizzlePatterns(g_state.regs.vs, g_state.vs, value);
+        u32& offset = g_state.regs.vs.swizzle_patterns.offset;
+        if (offset >= g_state.vs.swizzle_data.size()) {
+            LOG_ERROR(HW_GPU, "Invalid VS swizzle pattern offset %u", offset);
+        } else {
+            g_state.vs.swizzle_data[offset] = value;
+            if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) {
+                g_state.gs.swizzle_data[offset] = value;
+            }
+            offset++;
+        }
         break;
     }
 
diff --git a/src/video_core/regs_pipeline.h b/src/video_core/regs_pipeline.h
index 31c747d77..8b6369297 100644
--- a/src/video_core/regs_pipeline.h
+++ b/src/video_core/regs_pipeline.h
@@ -202,7 +202,14 @@ struct PipelineRegs {
     /// Number of input attributes to the vertex shader minus 1
     BitField<0, 4, u32> max_input_attrib_index;
 
-    INSERT_PADDING_WORDS(2);
+    INSERT_PADDING_WORDS(1);
+
+    // The shader unit 3, which can be used for both vertex and geometry shader, gets its
+    // configuration depending on this register. If this is not set, unit 3 will share some
+    // configuration with other units. It is known that program code and swizzle pattern uploaded
+    // via regs.vs will be also uploaded to unit 3 if this is not set. Although very likely, it is
+    // still unclear whether uniforms and other configuration can be also shared.
+    BitField<0, 1, u32> gs_unit_exclusive_configuration;
 
     enum class GPUMode : u32 {
         Drawing = 0,
-- 
cgit v1.2.3


From db309b2423a996cb792273080e73906b07f8b45b Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Mon, 24 Jul 2017 14:13:33 +0300
Subject: pica/regs: layout geometry shader configuration regs

All the register meanings are derived from ctrulib (3dbrew is outdated for most of them)
---
 src/video_core/regs_pipeline.h | 34 ++++++++++++++++++++++++++++++++--
 src/video_core/regs_shader.h   |  7 +++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/regs_pipeline.h b/src/video_core/regs_pipeline.h
index 8b6369297..e78c3e331 100644
--- a/src/video_core/regs_pipeline.h
+++ b/src/video_core/regs_pipeline.h
@@ -147,7 +147,15 @@ struct PipelineRegs {
     // Number of vertices to render
     u32 num_vertices;
 
-    INSERT_PADDING_WORDS(0x1);
+    enum class UseGS : u32 {
+        No = 0,
+        Yes = 2,
+    };
+
+    union {
+        BitField<0, 2, UseGS> use_gs;
+        BitField<31, 1, u32> variable_primitive;
+    };
 
     // The index of the first vertex to render
     u32 vertex_offset;
@@ -218,7 +226,29 @@ struct PipelineRegs {
 
     GPUMode gpu_mode;
 
-    INSERT_PADDING_WORDS(0x18);
+    INSERT_PADDING_WORDS(0x4);
+    BitField<0, 4, u32> vs_outmap_total_minus_1_a;
+    INSERT_PADDING_WORDS(0x6);
+    BitField<0, 4, u32> vs_outmap_total_minus_1_b;
+
+    enum class GSMode : u32 {
+        Point = 0,
+        VariablePrimitive = 1,
+        FixedPrimitive = 2,
+    };
+
+    union {
+        BitField<0, 8, GSMode> mode;
+        BitField<8, 4, u32> fixed_vertex_num_minus_1;
+        BitField<12, 4, u32> stride_minus_1;
+        BitField<16, 4, u32> start_index;
+    } gs_config;
+
+    INSERT_PADDING_WORDS(0x1);
+
+    u32 variable_vertex_main_num_minus_1;
+
+    INSERT_PADDING_WORDS(0x9);
 
     enum class TriangleTopology : u32 {
         List = 0,
diff --git a/src/video_core/regs_shader.h b/src/video_core/regs_shader.h
index ddb1ee451..c15d4d162 100644
--- a/src/video_core/regs_shader.h
+++ b/src/video_core/regs_shader.h
@@ -24,9 +24,16 @@ struct ShaderRegs {
 
     INSERT_PADDING_WORDS(0x4);
 
+    enum ShaderMode {
+        GS = 0x08,
+        VS = 0xA0,
+    };
+
     union {
         // Number of input attributes to shader unit - 1
         BitField<0, 4, u32> max_input_attribute_index;
+        BitField<8, 8, u32> input_to_uniform;
+        BitField<24, 8, ShaderMode> shader_mode;
     };
 
     // Offset to shader program entry point (in words)
-- 
cgit v1.2.3


From 5d9d42f0d0e0b2619e7412b86699a9b2b9dfa4ea Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 10 Aug 2017 11:56:55 +0300
Subject: SwRasterizer/Lighting: use make_tuple instead of constructor

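The implicit (conditionally non-explicit) std::tuple constructor that `return {a, b};` relies on is a C++17 feature, which is not yet supported by some not-so-old standard libraries. Play it safe for now and use std::make_tuple.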
---
 src/video_core/swrasterizer/lighting.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
index 63088eee8..d61e6d572 100644
--- a/src/video_core/swrasterizer/lighting.cpp
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -244,7 +244,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                                          MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
                                          MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
                         .Cast<u8>();
-    return {diffuse, specular};
+    return std::make_tuple(diffuse, specular);
 }
 
 } // namespace Pica
-- 
cgit v1.2.3


From 14ee32c46a6dc97c1c6a0597e72e5284bf4e86e6 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Fri, 11 Aug 2017 01:13:55 +0300
Subject: SwRasterizer/Lighting: implement geometric factor

---
 src/video_core/swrasterizer/lighting.cpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
index d61e6d572..91683afa4 100644
--- a/src/video_core/swrasterizer/lighting.cpp
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -55,6 +55,9 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
         light_vector.Normalize();
 
+        Math::Vec3<float> norm_view = view.Normalized();
+        Math::Vec3<float> half_vector = norm_view + light_vector;
+
         float dist_atten = 1.0f;
         if (!lighting.IsDistAttenDisabled(num)) {
             auto distance = (-view - position).Length();
@@ -74,17 +77,15 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
                                LightingRegs::LightingScale scale_enum,
                                LightingRegs::LightingSampler sampler) {
-            Math::Vec3<float> norm_view = view.Normalized();
-            Math::Vec3<float> half_angle = (norm_view + light_vector).Normalized();
             float result = 0.0f;
 
             switch (input) {
             case LightingRegs::LightingLutInput::NH:
-                result = Math::Dot(normal, half_angle);
+                result = Math::Dot(normal, half_vector.Normalized());
                 break;
 
             case LightingRegs::LightingLutInput::VH:
-                result = Math::Dot(norm_view, half_angle);
+                result = Math::Dot(norm_view, half_vector.Normalized());
                 break;
 
             case LightingRegs::LightingLutInput::NV:
@@ -224,6 +225,17 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
         else
             dot_product = std::max(dot_product, 0.0f);
 
+        if (light_config.config.geometric_factor_0 || light_config.config.geometric_factor_1) {
+            float geo_factor = half_vector.Length2();
+            geo_factor = geo_factor == 0.0f ? 0.0f : std::min(dot_product / geo_factor, 1.0f);
+            if (light_config.config.geometric_factor_0) {
+                specular_0 *= geo_factor;
+            }
+            if (light_config.config.geometric_factor_1) {
+                specular_1 *= geo_factor;
+            }
+        }
+
         auto diffuse =
             light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
         diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
-- 
cgit v1.2.3


From 945f9a1b04d51aff674e0b7061c29a04211a17bd Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Fri, 11 Aug 2017 00:41:37 +0300
Subject: SwRasterizer/Lighting: implement spot light

---
 src/video_core/swrasterizer/lighting.cpp | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
index d61e6d572..ffd35792a 100644
--- a/src/video_core/swrasterizer/lighting.cpp
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -95,6 +95,12 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                 result = Math::Dot(light_vector, normal);
                 break;
 
+            case LightingRegs::LightingLutInput::SP: {
+                Math::Vec3<s32> spot_dir{light_config.spot_x.Value(), light_config.spot_y.Value(),
+                                         light_config.spot_z.Value()};
+                result = Math::Dot(light_vector, spot_dir.Cast<float>() / 2047.0f);
+                break;
+            }
             default:
                 LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
                 UNIMPLEMENTED();
@@ -125,6 +131,16 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                    LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
         };
 
+        // If enabled, compute spot light attenuation value
+        float spot_atten = 1.0f;
+        if (!lighting.IsSpotAttenDisabled(num) &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::SpotlightAttenuation)) {
+            auto lut = LightingRegs::SpotlightAttenuationSampler(num);
+            spot_atten = GetLutValue(lighting.lut_input.sp, lighting.abs_lut_input.disable_sp == 0,
+                                     lighting.lut_scale.sp, lut);
+        }
+
         // Specular 0 component
         float d0_lut_value = 1.0f;
         if (lighting.config1.disable_lut_d0 == 0 &&
@@ -226,10 +242,10 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
         auto diffuse =
             light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
-        diffuse_sum += Math::MakeVec(diffuse * dist_atten, 0.0f);
+        diffuse_sum += Math::MakeVec(diffuse * dist_atten * spot_atten, 0.0f);
 
-        specular_sum +=
-            Math::MakeVec((specular_0 + specular_1) * clamp_highlights * dist_atten, 0.0f);
+        specular_sum += Math::MakeVec(
+            (specular_0 + specular_1) * clamp_highlights * dist_atten * spot_atten, 0.0f);
     }
 
     diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
-- 
cgit v1.2.3


From 686fb3e78cb394bb7db18fd951d104ca86d805d9 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Fri, 11 Aug 2017 18:24:24 +0300
Subject: gl_shader_gen: don't call SampleTexture when bump map is not used

---
 src/video_core/renderer_opengl/gl_shader_gen.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index bb192affd..ae67aab05 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -525,11 +525,12 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
            "float geo_factor = 1.0;\n";
 
     // Compute fragment normals and tangents
-    const std::string pertubation =
-        "2.0 * (" + SampleTexture(config, lighting.bump_selector) + ").rgb - 1.0";
+    auto Perturbation = [&]() {
+        return "2.0 * (" + SampleTexture(config, lighting.bump_selector) + ").rgb - 1.0";
+    };
     if (lighting.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
         // Bump mapping is enabled using a normal map
-        out += "vec3 surface_normal = " + pertubation + ";\n";
+        out += "vec3 surface_normal = " + Perturbation() + ";\n";
 
         // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher
         // precision result
@@ -543,7 +544,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         out += "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n";
     } else if (lighting.bump_mode == LightingRegs::LightingBumpMode::TangentMap) {
         // Bump mapping is enabled using a tangent map
-        out += "vec3 surface_tangent = " + pertubation + ";\n";
+        out += "vec3 surface_tangent = " + Perturbation() + ";\n";
         // Mathematically, recomputing Z-component of the tangent vector won't affect the relevant
         // computation below, which is also confirmed on 3DS. So we don't bother recomputing here
         // even if 'renorm' is enabled.
-- 
cgit v1.2.3


From 46c6973d2bde25a2a8ae9ac434660798fd1dfaee Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 25 Jul 2017 22:30:29 +0300
Subject: pica/shader: extend UnitState for GS

Among the four shader units in PICA, one special unit can be configured to run both VS and GS programs. GSUnitState represents this unit; it extends UnitState (which represents the other three normal units) with extra state for primitive emitting. It uses raw pointers to represent its internal structure in order to remain a standard-layout type that the JIT can access.
This unit doesn't handle triangle winding (inverting) itself; instead, it calls a WindingSetter handler. This will be explained in the following commits.
---
 src/video_core/shader/shader.cpp | 38 +++++++++++++++++++++++++++++++++
 src/video_core/shader/shader.h   | 46 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 67ed19ba8..b12468d3a 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -82,6 +82,44 @@ void UnitState::WriteOutput(const ShaderRegs& config, AttributeBuffer& output) {
     }
 }
 
+UnitState::UnitState(GSEmitter* emitter) : emitter_ptr(emitter) {}
+
+GSEmitter::GSEmitter() {
+    handlers = new Handlers;
+}
+
+GSEmitter::~GSEmitter() {
+    delete handlers;
+}
+
+void GSEmitter::Emit(Math::Vec4<float24> (&vertex)[16]) {
+    ASSERT(vertex_id < 3);
+    std::copy(std::begin(vertex), std::end(vertex), buffer[vertex_id].begin());
+    if (prim_emit) {
+        if (winding)
+            handlers->winding_setter();
+        for (size_t i = 0; i < buffer.size(); ++i) {
+            AttributeBuffer output;
+            unsigned int output_i = 0;
+            for (unsigned int reg : Common::BitSet<u32>(output_mask)) {
+                output.attr[output_i++] = buffer[i][reg];
+            }
+            handlers->vertex_handler(output);
+        }
+    }
+}
+
+GSUnitState::GSUnitState() : UnitState(&emitter) {}
+
+void GSUnitState::SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter) {
+    emitter.handlers->vertex_handler = std::move(vertex_handler);
+    emitter.handlers->winding_setter = std::move(winding_setter);
+}
+
+void GSUnitState::ConfigOutput(const ShaderRegs& config) {
+    emitter.output_mask = config.output_mask;
+}
+
 MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
 
 #ifdef ARCHITECTURE_x86_64
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index e156f6aef..caec96043 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <cstddef>
+#include <functional>
 #include <type_traits>
 #include <nihstro/shader_bytecode.h>
 #include "common/assert.h"
@@ -31,6 +32,12 @@ struct AttributeBuffer {
     alignas(16) Math::Vec4<float24> attr[16];
 };
 
+/// Handler type for receiving vertex outputs from vertex shader or geometry shader
+using VertexHandler = std::function<void(const AttributeBuffer&)>;
+
+/// Handler type for signaling to invert the vertex order of the next triangle
+using WindingSetter = std::function<void()>;
+
 struct OutputVertex {
     Math::Vec4<float24> pos;
     Math::Vec4<float24> quat;
@@ -60,6 +67,29 @@ ASSERT_POS(tc2, RasterizerRegs::VSOutputAttributes::TEXCOORD2_U);
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size");
 
+/**
+ * This structure contains state information for primitive emitting in geometry shader.
+ */
+struct GSEmitter {
+    std::array<std::array<Math::Vec4<float24>, 16>, 3> buffer;
+    u8 vertex_id;
+    bool prim_emit;
+    bool winding;
+    u32 output_mask;
+
+    // Function objects are hidden behind a raw pointer to make the structure standard layout type,
+    // for JIT to use offsetof to access other members.
+    struct Handlers {
+        VertexHandler vertex_handler;
+        WindingSetter winding_setter;
+    } * handlers;
+
+    GSEmitter();
+    ~GSEmitter();
+    void Emit(Math::Vec4<float24> (&vertex)[16]);
+};
+static_assert(std::is_standard_layout<GSEmitter>::value, "GSEmitter is not standard layout type");
+
 /**
  * This structure contains the state information that needs to be unique for a shader unit. The 3DS
  * has four shader units that process shaders in parallel. At the present, Citra only implements a
@@ -67,6 +97,7 @@ static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has inva
  * here will make it easier for us to parallelize the shader processing later.
  */
 struct UnitState {
+    explicit UnitState(GSEmitter* emitter = nullptr);
     struct Registers {
         // The registers are accessed by the shader JIT using SSE instructions, and are therefore
         // required to be 16-byte aligned.
@@ -82,6 +113,8 @@ struct UnitState {
     // TODO: How many bits do these actually have?
     s32 address_registers[3];
 
+    GSEmitter* emitter_ptr;
+
     static size_t InputOffset(const SourceRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Input:
@@ -125,6 +158,19 @@ struct UnitState {
     void WriteOutput(const ShaderRegs& config, AttributeBuffer& output);
 };
 
+/**
+ * This is an extended shader unit state that represents the special unit that can run both vertex
+ * shader and geometry shader. It contains an additional primitive emitter and utilities for
+ * geometry shader.
+ */
+struct GSUnitState : public UnitState {
+    GSUnitState();
+    void SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter);
+    void ConfigOutput(const ShaderRegs& config);
+
+    GSEmitter emitter;
+};
+
 struct ShaderSetup {
     struct {
         // The float uniforms are accessed by the shader JIT using SSE instructions, and are
-- 
cgit v1.2.3


From 28128348f21d83c30979ef10399a8a764bb08a73 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 25 Jul 2017 22:43:25 +0300
Subject: pica/shader/interpreter: implement SETEMIT and EMIT

---
 src/video_core/shader/shader_interpreter.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 206c0978a..9d4da4904 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -636,6 +636,22 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                 break;
             }
 
+            case OpCode::Id::EMIT: {
+                GSEmitter* emitter = state.emitter_ptr;
+                ASSERT_MSG(emitter, "Execute EMIT on VS");
+                emitter->Emit(state.registers.output);
+                break;
+            }
+
+            case OpCode::Id::SETEMIT: {
+                GSEmitter* emitter = state.emitter_ptr;
+                ASSERT_MSG(emitter, "Execute SETEMIT on VS");
+                emitter->vertex_id = instr.setemit.vertex_id;
+                emitter->prim_emit = instr.setemit.prim_emit != 0;
+                emitter->winding = instr.setemit.winding != 0;
+                break;
+            }
+
             default:
                 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value().EffectiveOpCode(),
-- 
cgit v1.2.3


From bb63ae305279d9a73ea70133c89e92a36dc79f69 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Wed, 26 Jul 2017 00:39:43 +0300
Subject: correct constness

---
 src/video_core/shader/shader.cpp | 3 ++-
 src/video_core/shader/shader.h   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index b12468d3a..e9063e616 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -21,7 +21,8 @@ namespace Pica {
 
 namespace Shader {
 
-OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, AttributeBuffer& input) {
+OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
+                                               const AttributeBuffer& input) {
     // Setup output data
     union {
         OutputVertex ret{};
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index caec96043..a3789da01 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -50,7 +50,8 @@ struct OutputVertex {
     INSERT_PADDING_WORDS(1);
     Math::Vec2<float24> tc2;
 
-    static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, AttributeBuffer& output);
+    static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
+                                            const AttributeBuffer& output);
 };
 #define ASSERT_POS(var, pos)                                                                       \
     static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong "       \
-- 
cgit v1.2.3


From 36981a5aa6ffcc10417e533ab00de3b6f9bad067 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Wed, 26 Jul 2017 15:07:13 +0300
Subject: pica/primitive_assembly: Handle winding for GS primitive

hwtest shows that, although the GS always emits a group of three vertices as one primitive, it still respects the topology type, as if the three vertices were input into the primitive assembler independently and sequentially. It also shows that the winding flag in SETEMIT only takes effect for the Shader topology type, which is believed to be the actual difference between List and Shader (hence the TODO was removed). However, only the Shader topology type is observed in official games when GS is in use, so the other modes seem to be just unintended usage.
---
 src/video_core/primitive_assembly.cpp | 15 ++++++++++++---
 src/video_core/primitive_assembly.h   |  7 +++++++
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index acd2ac5e2..9c3dd4cab 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -17,15 +17,18 @@ template <typename VertexType>
 void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
                                                   TriangleHandler triangle_handler) {
     switch (topology) {
-    // TODO: Figure out what's different with TriangleTopology::Shader.
     case PipelineRegs::TriangleTopology::List:
     case PipelineRegs::TriangleTopology::Shader:
         if (buffer_index < 2) {
             buffer[buffer_index++] = vtx;
         } else {
             buffer_index = 0;
-
-            triangle_handler(buffer[0], buffer[1], vtx);
+            if (topology == PipelineRegs::TriangleTopology::Shader && winding) {
+                triangle_handler(buffer[1], buffer[0], vtx);
+                winding = false;
+            } else {
+                triangle_handler(buffer[0], buffer[1], vtx);
+            }
         }
         break;
 
@@ -50,10 +53,16 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
     }
 }
 
+template <typename VertexType>
+void PrimitiveAssembler<VertexType>::SetWinding() {
+    winding = true;
+}
+
 template <typename VertexType>
 void PrimitiveAssembler<VertexType>::Reset() {
     buffer_index = 0;
     strip_ready = false;
+    winding = false;
 }
 
 template <typename VertexType>
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index e8eccdf27..12de8e3b9 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -29,6 +29,12 @@ struct PrimitiveAssembler {
      */
     void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler);
 
+    /**
+     * Invert the vertex order of the next triangle. Called by geometry shader emitter.
+     * This only takes effect for TriangleTopology::Shader.
+     */
+    void SetWinding();
+
     /**
      * Resets the internal state of the PrimitiveAssembler.
      */
@@ -45,6 +51,7 @@ private:
     int buffer_index;
     VertexType buffer[2];
     bool strip_ready = false;
+    bool winding = false;
 };
 
 } // namespace
-- 
cgit v1.2.3


From 8285ca4ad8f9a5d07c9a2ba91367fcf3756f5153 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Wed, 26 Jul 2017 18:44:52 +0300
Subject: pica/shader/jit: implement SETEMIT and EMIT

---
 src/video_core/shader/shader_jit_x64_compiler.cpp | 49 ++++++++++++++++++++++-
 src/video_core/shader/shader_jit_x64_compiler.h   |  2 +
 2 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp
index 42a57aab1..1b31623bd 100644
--- a/src/video_core/shader/shader_jit_x64_compiler.cpp
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@@ -75,8 +75,8 @@ const JitFunction instr_table[64] = {
     &JitShader::Compile_IF,    // ifu
     &JitShader::Compile_IF,    // ifc
     &JitShader::Compile_LOOP,  // loop
-    nullptr,                   // emit
-    nullptr,                   // sete
+    &JitShader::Compile_EMIT,  // emit
+    &JitShader::Compile_SETE,  // sete
     &JitShader::Compile_JMP,   // jmpc
     &JitShader::Compile_JMP,   // jmpu
     &JitShader::Compile_CMP,   // cmp
@@ -772,6 +772,51 @@ void JitShader::Compile_JMP(Instruction instr) {
     }
 }
 
+static void Emit(GSEmitter* emitter, Math::Vec4<float24> (*output)[16]) {
+    emitter->Emit(*output);
+}
+
+void JitShader::Compile_EMIT(Instruction instr) {
+    Label have_emitter, end;
+    mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]);
+    test(rax, rax);
+    jnz(have_emitter);
+
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute EMIT on VS"));
+    CallFarFunction(*this, LogCritical);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    jmp(end);
+
+    L(have_emitter);
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, rax);
+    mov(ABI_PARAM2, STATE);
+    add(ABI_PARAM2, static_cast<Xbyak::uint32>(offsetof(UnitState, registers.output)));
+    CallFarFunction(*this, Emit);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    L(end);
+}
+
+void JitShader::Compile_SETE(Instruction instr) {
+    Label have_emitter, end;
+    mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]);
+    test(rax, rax);
+    jnz(have_emitter);
+
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute SETEMIT on VS"));
+    CallFarFunction(*this, LogCritical);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    jmp(end);
+
+    L(have_emitter);
+    mov(byte[rax + offsetof(GSEmitter, vertex_id)], instr.setemit.vertex_id);
+    mov(byte[rax + offsetof(GSEmitter, prim_emit)], instr.setemit.prim_emit);
+    mov(byte[rax + offsetof(GSEmitter, winding)], instr.setemit.winding);
+    L(end);
+}
+
 void JitShader::Compile_Block(unsigned end) {
     while (program_counter < end) {
         Compile_NextInstr();
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h
index 31af0ca48..4aee56b1d 100644
--- a/src/video_core/shader/shader_jit_x64_compiler.h
+++ b/src/video_core/shader/shader_jit_x64_compiler.h
@@ -66,6 +66,8 @@ public:
     void Compile_JMP(Instruction instr);
     void Compile_CMP(Instruction instr);
     void Compile_MAD(Instruction instr);
+    void Compile_EMIT(Instruction instr);
+    void Compile_SETE(Instruction instr);
 
 private:
     void Compile_Block(unsigned end);
-- 
cgit v1.2.3


From 0f35755572fe63534813528de9a0710193f2e335 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Fri, 4 Aug 2017 17:03:17 +0300
Subject: pica/command_processor: build geometry pipeline and run geometry
 shader

The geometry pipeline manages data transfer between the VS, the GS and the primitive assembler. It has four known modes:
 - no GS mode: sends VS output directly to the primitive assembler (what citra currently does)
 - GS mode 0: sends VS output to GS input registers, and sends GS output to primitive assembler
 - GS mode 1: sends VS output to GS uniform registers, and sends GS output to primitive assembler. It also takes an index from the index buffer at the beginning of each primitive to determine the primitive size.
 - GS mode 2: similar to mode 1, but doesn't take the index and uses a fixed primitive size.
hwtest shows that immediate mode also supports GS (at least for mode 0), so the geometry pipeline is factored out into its own class in order to support both drawing modes.
In immediate mode, some games don't set the pipeline registers to valid values until the first attribute input, so a geometry pipeline reset flag is set by the `pipeline.vs_default_attributes_setup.index` trigger, and the actual pipeline reconfiguration is performed on the first attribute input.
In the normal drawing mode with an index buffer, the vertex cache is slightly modified to support the geometry pipeline. Instead of OutputVertex, it now holds AttributeBuffer, which is the input to the geometry pipeline. The AttributeBuffer->OutputVertex conversion is done inside the pipeline vertex handler. The actual hardware vertex cache is believed to be implemented in a similar way (because this is the only way that makes sense).
Both the geometry pipeline and the GS unit rely on state being preserved across draw calls, so they are put into the global state. In the future, the other three vertex shader units should also be placed in the global state, and a scheduler should be implemented on top of the four units. Note that the current gs_unit already allows running VS on it in the future.
---
 src/video_core/CMakeLists.txt        |   2 +
 src/video_core/command_processor.cpp |  54 +++----
 src/video_core/geometry_pipeline.cpp | 274 +++++++++++++++++++++++++++++++++++
 src/video_core/geometry_pipeline.h   |  49 +++++++
 src/video_core/pica.cpp              |  21 ++-
 src/video_core/pica_state.h          |  11 ++
 6 files changed, 383 insertions(+), 28 deletions(-)
 create mode 100644 src/video_core/geometry_pipeline.cpp
 create mode 100644 src/video_core/geometry_pipeline.h

(limited to 'src/video_core')

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index cffa4c952..82f47d8a9 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(SRCS
             command_processor.cpp
             debug_utils/debug_utils.cpp
+            geometry_pipeline.cpp
             pica.cpp
             primitive_assembly.cpp
             regs.cpp
@@ -29,6 +30,7 @@ set(SRCS
 set(HEADERS
             command_processor.h
             debug_utils/debug_utils.h
+            geometry_pipeline.h
             gpu_debugger.h
             pica.h
             pica_state.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index f98ca3302..fb65a3a0a 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -161,6 +161,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
     case PICA_REG_INDEX(pipeline.vs_default_attributes_setup.index):
         g_state.immediate.current_attribute = 0;
+        g_state.immediate.reset_geometry_pipeline = true;
         default_attr_counter = 0;
         break;
 
@@ -234,16 +235,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     shader_engine->Run(g_state.vs, shader_unit);
                     shader_unit.WriteOutput(regs.vs, output);
 
-                    // Send to renderer
-                    using Pica::Shader::OutputVertex;
-                    auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
-                                          const OutputVertex& v2) {
-                        VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
-                    };
-
-                    g_state.primitive_assembler.SubmitVertex(
-                        Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output),
-                        AddTriangle);
+                    // Send to geometry pipeline
+                    if (g_state.immediate.reset_geometry_pipeline) {
+                        g_state.geometry_pipeline.Reconfigure();
+                        g_state.immediate.reset_geometry_pipeline = false;
+                    }
+                    ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
+                    g_state.geometry_pipeline.Setup(shader_engine);
+                    g_state.geometry_pipeline.SubmitVertex(output);
                 }
             }
         }
@@ -321,8 +320,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         // The size has been tuned for optimal balance between hit-rate and the cost of lookup
         const size_t VERTEX_CACHE_SIZE = 32;
         std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
-        std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
-        Shader::OutputVertex output_vertex;
+        std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
+        Shader::AttributeBuffer vs_output;
 
         unsigned int vertex_cache_pos = 0;
         vertex_cache_ids.fill(-1);
@@ -332,6 +331,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
         shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
 
+        g_state.geometry_pipeline.Reconfigure();
+        g_state.geometry_pipeline.Setup(shader_engine);
+        if (g_state.geometry_pipeline.NeedIndexInput())
+            ASSERT(is_indexed);
+
         for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
             // Indexed rendering doesn't use the start offset
             unsigned int vertex =
@@ -345,6 +349,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             bool vertex_cache_hit = false;
 
             if (is_indexed) {
+                if (g_state.geometry_pipeline.NeedIndexInput()) {
+                    g_state.geometry_pipeline.SubmitIndex(vertex);
+                    continue;
+                }
+
                 if (g_debug_context && Pica::g_debug_context->recorder) {
                     int size = index_u16 ? 2 : 1;
                     memory_accesses.AddAccess(base_address + index_info.offset + size * index,
@@ -353,7 +362,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
                     if (vertex == vertex_cache_ids[i]) {
-                        output_vertex = vertex_cache[i];
+                        vs_output = vertex_cache[i];
                         vertex_cache_hit = true;
                         break;
                     }
@@ -362,7 +371,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             if (!vertex_cache_hit) {
                 // Initialize data for the current vertex
-                Shader::AttributeBuffer input, output{};
+                Shader::AttributeBuffer input;
                 loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
 
                 // Send to vertex shader
@@ -371,26 +380,17 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                                              (void*)&input);
                 shader_unit.LoadInput(regs.vs, input);
                 shader_engine->Run(g_state.vs, shader_unit);
-                shader_unit.WriteOutput(regs.vs, output);
-
-                // Retrieve vertex from register data
-                output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output);
+                shader_unit.WriteOutput(regs.vs, vs_output);
 
                 if (is_indexed) {
-                    vertex_cache[vertex_cache_pos] = output_vertex;
+                    vertex_cache[vertex_cache_pos] = vs_output;
                     vertex_cache_ids[vertex_cache_pos] = vertex;
                     vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
                 }
             }
 
-            // Send to renderer
-            using Pica::Shader::OutputVertex;
-            auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
-                                  const OutputVertex& v2) {
-                VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
-            };
-
-            primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
+            // Send to geometry pipeline
+            g_state.geometry_pipeline.SubmitVertex(vs_output);
         }
 
         for (auto& range : memory_accesses.ranges) {
diff --git a/src/video_core/geometry_pipeline.cpp b/src/video_core/geometry_pipeline.cpp
new file mode 100644
index 000000000..b146e2ecb
--- /dev/null
+++ b/src/video_core/geometry_pipeline.cpp
@@ -0,0 +1,274 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/geometry_pipeline.h"
+#include "video_core/pica_state.h"
+#include "video_core/regs.h"
+#include "video_core/renderer_base.h"
+#include "video_core/video_core.h"
+
+namespace Pica {
+
+/// An attribute buffering interface for different pipeline modes
+class GeometryPipelineBackend {
+public:
+    virtual ~GeometryPipelineBackend() = default;
+
+    /// Checks if there is no incomplete data transfer
+    virtual bool IsEmpty() const = 0;
+
+    /// Checks if the pipeline needs a direct input from index buffer
+    virtual bool NeedIndexInput() const = 0;
+
+    /// Submits an index from index buffer
+    virtual void SubmitIndex(unsigned int val) = 0;
+
+    /**
+     * Submits vertex attributes
+     * @param input attributes of a vertex output from vertex shader
+     * @return if the buffer is full and the geometry shader should be invoked
+     */
+    virtual bool SubmitVertex(const Shader::AttributeBuffer& input) = 0;
+};
+
+// In the Point mode, vertex attributes are sent to the input registers in the geometry shader unit.
+// The sizes of vertex shader outputs and geometry shader inputs are constant. The geometry shader
+// is invoked once the input buffer is filled by vertex shader outputs. For example, if we have a
+// geometry shader that takes 6 inputs, and the vertex shader outputs 2 attributes, it would take 3
+// vertices for one geometry shader invocation.
+// TODO: what happens when the input size is not divisible by the output size?
+class GeometryPipeline_Point : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_Point(const Regs& regs, Shader::GSUnitState& unit) : regs(regs), unit(unit) {
+        ASSERT(regs.pipeline.variable_primitive == 0);
+        ASSERT(regs.gs.input_to_uniform == 0);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+        size_t gs_input_num = regs.gs.max_input_attribute_index + 1;
+        ASSERT(gs_input_num % vs_output_num == 0);
+        buffer_cur = attribute_buffer.attr;
+        buffer_end = attribute_buffer.attr + gs_input_num;
+    }
+
+    bool IsEmpty() const override {
+        return buffer_cur == attribute_buffer.attr;
+    }
+
+    bool NeedIndexInput() const override {
+        return false;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        UNREACHABLE();
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+        if (buffer_cur == buffer_end) {
+            buffer_cur = attribute_buffer.attr;
+            unit.LoadInput(regs.gs, attribute_buffer);
+            return true;
+        }
+        return false;
+    }
+
+private:
+    const Regs& regs;
+    Shader::GSUnitState& unit;
+    Shader::AttributeBuffer attribute_buffer;
+    Math::Vec4<float24>* buffer_cur;
+    Math::Vec4<float24>* buffer_end;
+    unsigned int vs_output_num;
+};
+
+// In VariablePrimitive mode, vertex attributes are buffered into the uniform registers in the
+// geometry shader unit. The number of vertices is variable and is specified by the first index
+// value in the batch. This mode is usually used for subdivision.
+class GeometryPipeline_VariablePrimitive : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_VariablePrimitive(const Regs& regs, Shader::ShaderSetup& setup)
+        : regs(regs), setup(setup) {
+        ASSERT(regs.pipeline.variable_primitive == 1);
+        ASSERT(regs.gs.input_to_uniform == 1);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+    }
+
+    bool IsEmpty() const override {
+        return need_index;
+    }
+
+    bool NeedIndexInput() const override {
+        return need_index;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        DEBUG_ASSERT(need_index);
+
+        // The number of vertex input is put to the uniform register
+        float24 vertex_num = float24::FromFloat32(val);
+        setup.uniforms.f[0] = Math::MakeVec(vertex_num, vertex_num, vertex_num, vertex_num);
+
+        // The second uniform register and so on are used for receiving input vertices
+        buffer_cur = setup.uniforms.f + 1;
+
+        main_vertex_num = regs.pipeline.variable_vertex_main_num_minus_1 + 1;
+        total_vertex_num = val;
+        need_index = false;
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        DEBUG_ASSERT(!need_index);
+        if (main_vertex_num != 0) {
+            // For main vertices, receive all attributes
+            buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+            --main_vertex_num;
+        } else {
+            // For other vertices, only receive the first attribute (usually the position)
+            *(buffer_cur++) = input.attr[0];
+        }
+        --total_vertex_num;
+
+        if (total_vertex_num == 0) {
+            need_index = true;
+            return true;
+        }
+
+        return false;
+    }
+
+private:
+    bool need_index = true;
+    const Regs& regs;
+    Shader::ShaderSetup& setup;
+    unsigned int main_vertex_num;
+    unsigned int total_vertex_num;
+    Math::Vec4<float24>* buffer_cur;
+    unsigned int vs_output_num;
+};
+
+// In FixedPrimitive mode, vertex attributes are buffered into the uniform registers in the geometry
+// shader unit. The number of vertices per shader invocation is constant. This is usually used for
+// particle systems.
+class GeometryPipeline_FixedPrimitive : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_FixedPrimitive(const Regs& regs, Shader::ShaderSetup& setup)
+        : regs(regs), setup(setup) {
+        ASSERT(regs.pipeline.variable_primitive == 0);
+        ASSERT(regs.gs.input_to_uniform == 1);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+        ASSERT(vs_output_num == regs.pipeline.gs_config.stride_minus_1 + 1);
+        size_t vertex_num = regs.pipeline.gs_config.fixed_vertex_num_minus_1 + 1;
+        buffer_cur = buffer_begin = setup.uniforms.f + regs.pipeline.gs_config.start_index;
+        buffer_end = buffer_begin + vs_output_num * vertex_num;
+    }
+
+    bool IsEmpty() const override {
+        return buffer_cur == buffer_begin;
+    }
+
+    bool NeedIndexInput() const override {
+        return false;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        UNREACHABLE();
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+        if (buffer_cur == buffer_end) {
+            buffer_cur = buffer_begin;
+            return true;
+        }
+        return false;
+    }
+
+private:
+    const Regs& regs;
+    Shader::ShaderSetup& setup;
+    Math::Vec4<float24>* buffer_begin;
+    Math::Vec4<float24>* buffer_cur;
+    Math::Vec4<float24>* buffer_end;
+    unsigned int vs_output_num;
+};
+
+GeometryPipeline::GeometryPipeline(State& state) : state(state) {}
+
+GeometryPipeline::~GeometryPipeline() = default;
+
+void GeometryPipeline::SetVertexHandler(Shader::VertexHandler vertex_handler) {
+    this->vertex_handler = vertex_handler;
+}
+
+void GeometryPipeline::Setup(Shader::ShaderEngine* shader_engine) {
+    if (!backend)
+        return;
+
+    this->shader_engine = shader_engine;
+    shader_engine->SetupBatch(state.gs, state.regs.gs.main_offset);
+}
+
+void GeometryPipeline::Reconfigure() {
+    ASSERT(!backend || backend->IsEmpty());
+
+    if (state.regs.pipeline.use_gs == PipelineRegs::UseGS::No) {
+        backend = nullptr;
+        return;
+    }
+
+    ASSERT(state.regs.pipeline.use_gs == PipelineRegs::UseGS::Yes);
+
+    // The following assumes that when geometry shader is in use, the shader unit 3 is configured as
+    // a geometry shader unit.
+    // TODO: what happens if this is not true?
+    ASSERT(state.regs.pipeline.gs_unit_exclusive_configuration == 1);
+    ASSERT(state.regs.gs.shader_mode == ShaderRegs::ShaderMode::GS);
+
+    state.gs_unit.ConfigOutput(state.regs.gs);
+
+    ASSERT(state.regs.pipeline.vs_outmap_total_minus_1_a ==
+           state.regs.pipeline.vs_outmap_total_minus_1_b);
+
+    switch (state.regs.pipeline.gs_config.mode) {
+    case PipelineRegs::GSMode::Point:
+        backend = std::make_unique<GeometryPipeline_Point>(state.regs, state.gs_unit);
+        break;
+    case PipelineRegs::GSMode::VariablePrimitive:
+        backend = std::make_unique<GeometryPipeline_VariablePrimitive>(state.regs, state.gs);
+        break;
+    case PipelineRegs::GSMode::FixedPrimitive:
+        backend = std::make_unique<GeometryPipeline_FixedPrimitive>(state.regs, state.gs);
+        break;
+    default:
+        UNREACHABLE();
+    }
+}
+
+bool GeometryPipeline::NeedIndexInput() const {
+    if (!backend)
+        return false;
+    return backend->NeedIndexInput();
+}
+
+void GeometryPipeline::SubmitIndex(unsigned int val) {
+    backend->SubmitIndex(val);
+}
+
+void GeometryPipeline::SubmitVertex(const Shader::AttributeBuffer& input) {
+    if (!backend) {
+        // No backend means the geometry shader is disabled, so we send the vertex shader output
+        // directly to the primitive assembler.
+        vertex_handler(input);
+    } else {
+        if (backend->SubmitVertex(input)) {
+            shader_engine->Run(state.gs, state.gs_unit);
+
+            // The uniform b15 is set to true after every geometry shader invocation. This is useful
+            // for the shader to know if this is the first invocation in a batch, if the program set
+            // b15 to false first.
+            state.gs.uniforms.b[15] = true;
+        }
+    }
+}
+
+} // namespace Pica
diff --git a/src/video_core/geometry_pipeline.h b/src/video_core/geometry_pipeline.h
new file mode 100644
index 000000000..91fdd3192
--- /dev/null
+++ b/src/video_core/geometry_pipeline.h
@@ -0,0 +1,49 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include "video_core/shader/shader.h"
+
+namespace Pica {
+
+struct State;
+
+class GeometryPipelineBackend;
+
+/// A pipeline receiving from vertex shader and sending to geometry shader and primitive assembler
+class GeometryPipeline {
+public:
+    explicit GeometryPipeline(State& state);
+    ~GeometryPipeline();
+
+    /// Sets the handler for receiving vertex outputs from vertex shader
+    void SetVertexHandler(Shader::VertexHandler vertex_handler);
+
+    /**
+     * Setup the geometry shader unit if it is in use
+     * @param shader_engine the shader engine for the geometry shader to run
+     */
+    void Setup(Shader::ShaderEngine* shader_engine);
+
+    /// Reconfigures the pipeline according to current register settings
+    void Reconfigure();
+
+    /// Checks if the pipeline needs a direct input from index buffer
+    bool NeedIndexInput() const;
+
+    /// Submits an index from index buffer. Call this only when NeedIndexInput returns true
+    void SubmitIndex(unsigned int val);
+
+    /// Submits vertex attributes output from vertex shader
+    void SubmitVertex(const Shader::AttributeBuffer& input);
+
+private:
+    Shader::VertexHandler vertex_handler;
+    Shader::ShaderEngine* shader_engine;
+    std::unique_ptr<GeometryPipelineBackend> backend;
+    State& state;
+};
+} // namespace Pica
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index b95148a6a..218e06883 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -3,9 +3,11 @@
 // Refer to the license.txt file included.
 
 #include <cstring>
+#include "video_core/geometry_pipeline.h"
 #include "video_core/pica.h"
 #include "video_core/pica_state.h"
-#include "video_core/regs_pipeline.h"
+#include "video_core/renderer_base.h"
+#include "video_core/video_core.h"
 
 namespace Pica {
 
@@ -24,6 +26,23 @@ void Zero(T& o) {
     memset(&o, 0, sizeof(o));
 }
 
+State::State() : geometry_pipeline(*this) {
+    auto SubmitVertex = [this](const Shader::AttributeBuffer& vertex) {
+        using Pica::Shader::OutputVertex;
+        auto AddTriangle = [this](const OutputVertex& v0, const OutputVertex& v1,
+                                  const OutputVertex& v2) {
+            VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
+        };
+        primitive_assembler.SubmitVertex(
+            Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, vertex), AddTriangle);
+    };
+
+    auto SetWinding = [this]() { primitive_assembler.SetWinding(); };
+
+    g_state.gs_unit.SetVertexHandler(SubmitVertex, SetWinding);
+    g_state.geometry_pipeline.SetVertexHandler(SubmitVertex);
+}
+
 void State::Reset() {
     Zero(regs);
     Zero(vs);
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index 864a2c9e6..c6634a0bc 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -8,6 +8,7 @@
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "common/vector_math.h"
+#include "video_core/geometry_pipeline.h"
 #include "video_core/primitive_assembly.h"
 #include "video_core/regs.h"
 #include "video_core/shader/shader.h"
@@ -16,6 +17,7 @@ namespace Pica {
 
 /// Struct used to describe current Pica state
 struct State {
+    State();
     void Reset();
 
     /// Pica registers
@@ -137,8 +139,17 @@ struct State {
         Shader::AttributeBuffer input_vertex;
         // Index of the next attribute to be loaded into `input_vertex`.
         u32 current_attribute = 0;
+        // Indicates the immediate mode just started and the geometry pipeline needs to reconfigure
+        bool reset_geometry_pipeline = true;
     } immediate;
 
+    // the geometry shader needs to be kept in the global state because some shaders rely on
+    // preserved register values across shader invocations.
+    // TODO: also bring the three vertex shader units here and implement the shader scheduler.
+    Shader::GSUnitState gs_unit;
+
+    GeometryPipeline geometry_pipeline;
+
     // This is constructed with a dummy triangle topology
     PrimitiveAssembler<Shader::OutputVertex> primitive_assembler;
 };
-- 
cgit v1.2.3


From 1eca380886b5028e027f1380c04f221ac94ed47d Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 17 Aug 2017 10:46:59 +0300
Subject: gl_rasterizer: add clipping plane z<=0 defined in PICA

---
 src/video_core/renderer_opengl/gl_rasterizer.cpp |  3 +++
 src/video_core/renderer_opengl/gl_shader_gen.cpp |  2 ++
 src/video_core/renderer_opengl/gl_state.cpp      | 13 +++++++++++++
 src/video_core/renderer_opengl/gl_state.h        |  3 +++
 4 files changed, 21 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 1c6c15a58..aa95ef21d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -28,6 +28,9 @@ MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
 RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
+    // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
+    state.clip_distance[0] = true;
+
     // Create sampler objects
     for (size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index ae67aab05..0dae4b91e 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -1196,6 +1196,8 @@ void main() {
     normquat = vert_normquat;
     view = vert_view;
     gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
+    gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
+    // TODO (wwylele): calculate gl_ClipDistance[1] from user-defined clipping plane
 }
 )";
 
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index bc9d34b84..06a905766 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -68,6 +68,8 @@ OpenGLState::OpenGLState() {
     draw.vertex_buffer = 0;
     draw.uniform_buffer = 0;
     draw.shader_program = 0;
+
+    clip_distance = {};
 }
 
 void OpenGLState::Apply() const {
@@ -261,6 +263,17 @@ void OpenGLState::Apply() const {
         glUseProgram(draw.shader_program);
     }
 
+    // Clip distance
+    for (size_t i = 0; i < clip_distance.size(); ++i) {
+        if (clip_distance[i] != cur_state.clip_distance[i]) {
+            if (clip_distance[i]) {
+                glEnable(GL_CLIP_DISTANCE0 + i);
+            } else {
+                glDisable(GL_CLIP_DISTANCE0 + i);
+            }
+        }
+    }
+
     cur_state = *this;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 745a74479..437fe34c4 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <array>
 #include <glad/glad.h>
 
 namespace TextureUnits {
@@ -123,6 +124,8 @@ public:
         GLuint shader_program;   // GL_CURRENT_PROGRAM
     } draw;
 
+    std::array<bool, 2> clip_distance; // GL_CLIP_DISTANCE
+
     OpenGLState();
 
     /// Get the currently active OpenGL state
-- 
cgit v1.2.3


From 5a4af616c67a4d7968c71b419795777c3601341b Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 17 Aug 2017 10:56:15 +0300
Subject: gl_shader_gen: simplify and clarify the depth transformation between
 vertex shader and fragment shader

---
 src/video_core/renderer_opengl/gl_shader_gen.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 0dae4b91e..015e69da9 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -1112,7 +1112,10 @@ vec4 secondary_fragment_color = vec4(0.0);
                "gl_FragCoord.y < scissor_y2)) discard;\n";
     }
 
-    out += "float z_over_w = 1.0 - gl_FragCoord.z * 2.0;\n";
+    // After perspective divide, OpenGL transforms z_over_w from [-1, 1] to [near, far]. Here we use
+    // default near = 0 and far = 1, and undo the transformation to get the original z_over_w, then
+    // do our own transformation according to PICA specification.
+    out += "float z_over_w = 2.0 * gl_FragCoord.z - 1.0;\n";
     out += "float depth = z_over_w * depth_scale + depth_offset;\n";
     if (state.depthmap_enable == RasterizerRegs::DepthBuffering::WBuffering) {
         out += "depth /= gl_FragCoord.w;\n";
@@ -1195,7 +1198,7 @@ void main() {
     texcoord0_w = vert_texcoord0_w;
     normquat = vert_normquat;
     view = vert_view;
-    gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
+    gl_Position = vert_position;
     gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
     // TODO (wwylele): calculate gl_ClipDistance[1] from user-defined clipping plane
 }
-- 
cgit v1.2.3


From 72b26ac32f74457d017e4eb96d83e2a66e713a5a Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 17 Aug 2017 10:57:31 +0300
Subject: swrasterizer/clipper: remove tested TODO

hwtested. Current implementation is the correct behavior
---
 src/video_core/swrasterizer/clipper.cpp | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index 7537689b7..cdbc71502 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -125,10 +125,6 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
         {Math::MakeVec(f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
     }};
 
-    // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
-    //       drop the whole primitive instead of clipping the primitive properly. We should test if
-    //       this happens on the 3DS, too.
-
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
     for (auto edge : clipping_edges) {
-- 
cgit v1.2.3


From 63b6e802cdffc1464b4a1fe3f5171d71146e8e9a Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 17 Aug 2017 11:02:19 +0300
Subject: swrasterizer: remove invalid TODO

This function is called in clipping, before the perspective divide, and is not used in later rasterization. Thus it doesn't need perspective correction.
---
 src/video_core/swrasterizer/rasterizer.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/rasterizer.h b/src/video_core/swrasterizer/rasterizer.h
index 2f0877581..66cd6cfd4 100644
--- a/src/video_core/swrasterizer/rasterizer.h
+++ b/src/video_core/swrasterizer/rasterizer.h
@@ -19,10 +19,9 @@ struct Vertex : Shader::OutputVertex {
 
     // Linear interpolation
     // factor: 0=this, 1=vtx
+    // Note: This function cannot be called after perspective divide
     void Lerp(float24 factor, const Vertex& vtx) {
         pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
-
-        // TODO: Should perform perspective correct interpolation here...
         quat = quat * factor + vtx.quat * (float24::FromFloat32(1) - factor);
         color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
         tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
@@ -30,12 +29,11 @@ struct Vertex : Shader::OutputVertex {
         tc0_w = tc0_w * factor + vtx.tc0_w * (float24::FromFloat32(1) - factor);
         view = view * factor + vtx.view * (float24::FromFloat32(1) - factor);
         tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
-
-        screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
     }
 
     // Linear interpolation
     // factor: 0=v0, 1=v1
+    // Note: This function cannot be called after perspective divide
     static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) {
         Vertex ret = v0;
         ret.Lerp(factor, v1);
-- 
cgit v1.2.3


From 3e478ca13110639a67ad95880aae5d7d13e096b7 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Fri, 18 Aug 2017 15:04:56 +0300
Subject: SwRasterizer/Lighting: implement bump mapping

---
 src/video_core/swrasterizer/lighting.cpp   | 28 +++++++++++++++++++++++-----
 src/video_core/swrasterizer/lighting.h     |  3 ++-
 src/video_core/swrasterizer/rasterizer.cpp |  4 ++--
 3 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
index 39a3e396d..4f16bac07 100644
--- a/src/video_core/swrasterizer/lighting.cpp
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -22,14 +22,32 @@ static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut
 
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
     const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
-    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view,
+    const Math::Vec4<u8> (&texture_color)[4]) {
 
-    // TODO(Subv): Bump mapping
-    Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
+    Math::Vec3<float> surface_normal;
+    Math::Vec3<float> surface_tangent;
 
     if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
-        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping");
-        UNIMPLEMENTED();
+        Math::Vec3<float> perturbation =
+            texture_color[lighting.config0.bump_selector].xyz().Cast<float>() / 127.5f -
+            Math::MakeVec(1.0f, 1.0f, 1.0f);
+        if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
+            if (!lighting.config0.disable_bump_renorm) {
+                const float z_square = 1 - perturbation.xy().Length2();
+                perturbation.z = std::sqrt(std::max(z_square, 0.0f));
+            }
+            surface_normal = perturbation;
+            surface_tangent = Math::MakeVec(1.0f, 0.0f, 0.0f);
+        } else if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::TangentMap) {
+            surface_normal = Math::MakeVec(0.0f, 0.0f, 1.0f);
+            surface_tangent = perturbation;
+        } else {
+            LOG_ERROR(HW_GPU, "Unknown bump mode %u", lighting.config0.bump_mode.Value());
+        }
+    } else {
+        surface_normal = Math::MakeVec(0.0f, 0.0f, 1.0f);
+        surface_tangent = Math::MakeVec(1.0f, 0.0f, 0.0f);
     }
 
     // Use the normalized the quaternion when performing the rotation
diff --git a/src/video_core/swrasterizer/lighting.h b/src/video_core/swrasterizer/lighting.h
index 438dca926..d807a3d94 100644
--- a/src/video_core/swrasterizer/lighting.h
+++ b/src/video_core/swrasterizer/lighting.h
@@ -13,6 +13,7 @@ namespace Pica {
 
 std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
     const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
-    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view);
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view,
+    const Math::Vec4<u8> (&texture_color)[4]);
 
 } // namespace Pica
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index fdc1df199..862135614 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -437,8 +437,8 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                     GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
                     GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
                 };
-                std::tie(primary_fragment_color, secondary_fragment_color) =
-                    ComputeFragmentsColors(g_state.regs.lighting, g_state.lighting, normquat, view);
+                std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
+                    g_state.regs.lighting, g_state.lighting, normquat, view, texture_color);
             }
 
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
-- 
cgit v1.2.3


From b5aa5703540adceb1fc867b577dad50388a47e15 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Fri, 18 Aug 2017 16:35:11 +0300
Subject: SwRasterizer/Lighting: implement LUT input CP

---
 src/video_core/swrasterizer/lighting.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
index 4f16bac07..b38964530 100644
--- a/src/video_core/swrasterizer/lighting.cpp
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -52,6 +52,7 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
 
     // Use the normalized the quaternion when performing the rotation
     auto normal = Math::QuaternionRotate(normquat, surface_normal);
+    auto tangent = Math::QuaternionRotate(normquat, surface_tangent);
 
     Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
     Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
@@ -120,6 +121,16 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
                 result = Math::Dot(light_vector, spot_dir.Cast<float>() / 2047.0f);
                 break;
             }
+            case LightingRegs::LightingLutInput::CP:
+                if (lighting.config0.config == LightingRegs::LightingConfig::Config7) {
+                    const Math::Vec3<float> norm_half_vector = half_vector.Normalized();
+                    const Math::Vec3<float> half_vector_proj =
+                        norm_half_vector - normal * Math::Dot(normal, norm_half_vector);
+                    result = Math::Dot(half_vector_proj, tangent);
+                } else {
+                    result = 0.0f;
+                }
+                break;
             default:
                 LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
                 UNIMPLEMENTED();
-- 
cgit v1.2.3


From 17c6104d2afda7bf354c454f87561a3dbdf524e3 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Mon, 21 Aug 2017 12:03:38 +0300
Subject: gl_rasterizer/lighting: more accurate CP formula

---
 src/video_core/renderer_opengl/gl_shader_gen.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index ae67aab05..d85f281e5 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -594,8 +594,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
                 // Note: even if the normal vector is modified by normal map, which is not the
                 // normal of the tangent plane anymore, the half angle vector is still projected
                 // using the modified normal vector.
-                std::string half_angle_proj = "normalize(half_vector) - normal / dot(normal, "
-                                              "normal) * dot(normal, normalize(half_vector))";
+                std::string half_angle_proj =
+                    "normalize(half_vector) - normal * dot(normal, normalize(half_vector))";
                 // Note: the half angle vector projection is confirmed not normalized before the dot
                 // product. The result is in fact not cos(phi) as the name suggested.
                 index = "dot(" + half_angle_proj + ", tangent)";
-- 
cgit v1.2.3


From ea51a3af261254e5455f63a0ef41e55ef1dfc471 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 22 Aug 2017 09:49:26 +0300
Subject: SwRasterizer: implement custom clip plane

---
 src/video_core/regs_rasterizer.h        | 14 ++++++++++++--
 src/video_core/swrasterizer/clipper.cpp | 15 +++++++++++++--
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/regs_rasterizer.h b/src/video_core/regs_rasterizer.h
index 2874fd127..4fef00d76 100644
--- a/src/video_core/regs_rasterizer.h
+++ b/src/video_core/regs_rasterizer.h
@@ -5,10 +5,10 @@
 #pragma once
 
 #include <array>
-
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "video_core/pica_types.h"
 
 namespace Pica {
 
@@ -31,7 +31,17 @@ struct RasterizerRegs {
 
     BitField<0, 24, u32> viewport_size_y;
 
-    INSERT_PADDING_WORDS(0x9);
+    INSERT_PADDING_WORDS(0x3);
+
+    BitField<0, 1, u32> clip_enable;
+    BitField<0, 24, u32> clip_coef[4]; // float24
+
+    Math::Vec4<float24> GetClipCoef() const {
+        return {float24::FromRaw(clip_coef[0]), float24::FromRaw(clip_coef[1]),
+                float24::FromRaw(clip_coef[2]), float24::FromRaw(clip_coef[3])};
+    }
+
+    INSERT_PADDING_WORDS(0x1);
 
     BitField<0, 24, u32> viewport_depth_range;      // float24
     BitField<0, 24, u32> viewport_depth_near_plane; // float24
diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index cdbc71502..cc76ba555 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -127,8 +127,7 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
 
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
-    for (auto edge : clipping_edges) {
-
+    auto Clip = [&](const ClippingEdge& edge) {
         std::swap(input_list, output_list);
         output_list->clear();
 
@@ -147,12 +146,24 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
             }
             reference_vertex = &vertex;
         }
+    };
+
+    for (auto edge : clipping_edges) {
+        Clip(edge);
 
         // Need to have at least a full triangle to continue...
         if (output_list->size() < 3)
             return;
     }
 
+    if (g_state.regs.rasterizer.clip_enable) {
+        ClippingEdge custom_edge{-g_state.regs.rasterizer.GetClipCoef()};
+        Clip(custom_edge);
+
+        if (output_list->size() < 3)
+            return;
+    }
+
     InitScreenCoordinates((*output_list)[0]);
     InitScreenCoordinates((*output_list)[1]);
 
-- 
cgit v1.2.3


From addbcd5784c8195f49cecc20834537c80d1c8c72 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 22 Aug 2017 09:49:53 +0300
Subject: gl_rasterizer: implement custom clip plane

---
 src/video_core/renderer_opengl/gl_rasterizer.cpp | 28 +++++++++
 src/video_core/renderer_opengl/gl_rasterizer.h   |  9 ++-
 src/video_core/renderer_opengl/gl_shader_gen.cpp | 80 ++++++++++++++----------
 3 files changed, 83 insertions(+), 34 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index aa95ef21d..7b0cd1b66 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -169,6 +169,8 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
     glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_diff_lut_buffer.handle);
 
     // Sync fixed function OpenGL state
+    SyncClipEnabled();
+    SyncClipCoef();
     SyncCullMode();
     SyncBlendEnabled();
     SyncBlendFuncs();
@@ -401,6 +403,18 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncCullMode();
         break;
 
+    // Clipping plane
+    case PICA_REG_INDEX(rasterizer.clip_enable):
+        SyncClipEnabled();
+        break;
+
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[0], 0x48):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[1], 0x49):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[2], 0x4a):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[3], 0x4b):
+        SyncClipCoef();
+        break;
+
     // Depth modifiers
     case PICA_REG_INDEX(rasterizer.viewport_depth_range):
         SyncDepthScale();
@@ -1280,6 +1294,20 @@ void RasterizerOpenGL::SetShader() {
     }
 }
 
+void RasterizerOpenGL::SyncClipEnabled() {
+    state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0;
+}
+
+void RasterizerOpenGL::SyncClipCoef() {
+    const auto raw_clip_coef = Pica::g_state.regs.rasterizer.GetClipCoef();
+    const GLvec4 new_clip_coef = {raw_clip_coef.x.ToFloat32(), raw_clip_coef.y.ToFloat32(),
+                                  raw_clip_coef.z.ToFloat32(), raw_clip_coef.w.ToFloat32()};
+    if (new_clip_coef != uniform_block_data.data.clip_coef) {
+        uniform_block_data.data.clip_coef = new_clip_coef;
+        uniform_block_data.dirty = true;
+    }
+}
+
 void RasterizerOpenGL::SyncCullMode() {
     const auto& regs = Pica::g_state.regs;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 78e218efe..46c62961c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -151,14 +151,21 @@ private:
         LightSrc light_src[8];
         alignas(16) GLvec4 const_color[6]; // A vec4 color for each of the six tev stages
         alignas(16) GLvec4 tev_combiner_buffer_color;
+        alignas(16) GLvec4 clip_coef;
     };
 
     static_assert(
-        sizeof(UniformData) == 0x460,
+        sizeof(UniformData) == 0x470,
         "The size of the UniformData structure has changed, update the structure in the shader");
     static_assert(sizeof(UniformData) < 16384,
                   "UniformData structure must be less than 16kb as per the OpenGL spec");
 
+    /// Syncs the clip enabled status to match the PICA register
+    void SyncClipEnabled();
+
+    /// Syncs the clip coefficients to match the PICA register
+    void SyncClipCoef();
+
     /// Sets the OpenGL shader in accordance with the current PICA register state
     void SetShader();
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 015e69da9..aa60b2e7f 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -24,6 +24,42 @@ using TevStageConfig = TexturingRegs::TevStageConfig;
 
 namespace GLShader {
 
+static const std::string UniformBlockDef = R"(
+#define NUM_TEV_STAGES 6
+#define NUM_LIGHTS 8
+
+struct LightSrc {
+    vec3 specular_0;
+    vec3 specular_1;
+    vec3 diffuse;
+    vec3 ambient;
+    vec3 position;
+    vec3 spot_direction;
+    float dist_atten_bias;
+    float dist_atten_scale;
+};
+
+layout (std140) uniform shader_data {
+    vec2 framebuffer_scale;
+    int alphatest_ref;
+    float depth_scale;
+    float depth_offset;
+    int scissor_x1;
+    int scissor_y1;
+    int scissor_x2;
+    int scissor_y2;
+    vec3 fog_color;
+    vec2 proctex_noise_f;
+    vec2 proctex_noise_a;
+    vec2 proctex_noise_p;
+    vec3 lighting_global_ambient;
+    LightSrc light_src[NUM_LIGHTS];
+    vec4 const_color[NUM_TEV_STAGES];
+    vec4 tev_combiner_buffer_color;
+    vec4 clip_coef;
+};
+)";
+
 PicaShaderConfig PicaShaderConfig::BuildFromRegs(const Pica::Regs& regs) {
     PicaShaderConfig res;
 
@@ -1008,8 +1044,6 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {
 
     std::string out = R"(
 #version 330 core
-#define NUM_TEV_STAGES 6
-#define NUM_LIGHTS 8
 
 in vec4 primary_color;
 in vec2 texcoord[3];
@@ -1021,36 +1055,6 @@ in vec4 gl_FragCoord;
 
 out vec4 color;
 
-struct LightSrc {
-    vec3 specular_0;
-    vec3 specular_1;
-    vec3 diffuse;
-    vec3 ambient;
-    vec3 position;
-    vec3 spot_direction;
-    float dist_atten_bias;
-    float dist_atten_scale;
-};
-
-layout (std140) uniform shader_data {
-    vec2 framebuffer_scale;
-    int alphatest_ref;
-    float depth_scale;
-    float depth_offset;
-    int scissor_x1;
-    int scissor_y1;
-    int scissor_x2;
-    int scissor_y2;
-    vec3 fog_color;
-    vec2 proctex_noise_f;
-    vec2 proctex_noise_a;
-    vec2 proctex_noise_p;
-    vec3 lighting_global_ambient;
-    LightSrc light_src[NUM_LIGHTS];
-    vec4 const_color[NUM_TEV_STAGES];
-    vec4 tev_combiner_buffer_color;
-};
-
 uniform sampler2D tex[3];
 uniform samplerBuffer lighting_lut;
 uniform samplerBuffer fog_lut;
@@ -1059,7 +1063,11 @@ uniform samplerBuffer proctex_color_map;
 uniform samplerBuffer proctex_alpha_map;
 uniform samplerBuffer proctex_lut;
 uniform samplerBuffer proctex_diff_lut;
+)";
+
+    out += UniformBlockDef;
 
+    out += R"(
 // Rotate the vector v by the quaternion q
 vec3 quaternion_rotate(vec4 q, vec3 v) {
     return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v);
@@ -1190,6 +1198,12 @@ out float texcoord0_w;
 out vec4 normquat;
 out vec3 view;
 
+)";
+
+    out += UniformBlockDef;
+
+    out += R"(
+
 void main() {
     primary_color = vert_color;
     texcoord[0] = vert_texcoord0;
@@ -1200,7 +1214,7 @@ void main() {
     view = vert_view;
     gl_Position = vert_position;
     gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
-    // TODO (wwylele): calculate gl_ClipDistance[1] from user-defined clipping plane
+    gl_ClipDistance[1] = dot(clip_coef, vert_position);
 }
 )";
 
-- 
cgit v1.2.3


From 417cb45e3fc20a7529ce5d548ba0fbc36ea0a621 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Tue, 22 Aug 2017 09:47:15 +0300
Subject: SwRasterizer/Clipper: flip the sign convention to match PICA and
 OpenGL

---
 src/video_core/swrasterizer/clipper.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index cc76ba555..a52129eb7 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -31,7 +31,7 @@ public:
         : coeffs(coeffs), bias(bias) {}
 
     bool IsInside(const Vertex& vertex) const {
-        return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
+        return Math::Dot(vertex.pos + bias, coeffs) >= float24::FromFloat32(0);
     }
 
     bool IsOutSide(const Vertex& vertex) const {
@@ -116,13 +116,13 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     static const float24 f0 = float24::FromFloat32(0.0);
     static const float24 f1 = float24::FromFloat32(1.0);
     static const std::array<ClippingEdge, 7> clipping_edges = {{
-        {Math::MakeVec(f1, f0, f0, -f1)},                                           // x = +w
-        {Math::MakeVec(-f1, f0, f0, -f1)},                                          // x = -w
-        {Math::MakeVec(f0, f1, f0, -f1)},                                           // y = +w
-        {Math::MakeVec(f0, -f1, f0, -f1)},                                          // y = -w
-        {Math::MakeVec(f0, f0, f1, f0)},                                            // z =  0
-        {Math::MakeVec(f0, f0, -f1, -f1)},                                          // z = -w
-        {Math::MakeVec(f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
+        {Math::MakeVec(-f1, f0, f0, f1)},                                          // x = +w
+        {Math::MakeVec(f1, f0, f0, f1)},                                           // x = -w
+        {Math::MakeVec(f0, -f1, f0, f1)},                                          // y = +w
+        {Math::MakeVec(f0, f1, f0, f1)},                                           // y = -w
+        {Math::MakeVec(f0, f0, -f1, f0)},                                          // z =  0
+        {Math::MakeVec(f0, f0, f1, f1)},                                           // z = -w
+        {Math::MakeVec(f0, f0, f0, f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
     }};
 
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
@@ -157,7 +157,7 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     }
 
     if (g_state.regs.rasterizer.clip_enable) {
-        ClippingEdge custom_edge{-g_state.regs.rasterizer.GetClipCoef()};
+        ClippingEdge custom_edge{g_state.regs.rasterizer.GetClipCoef()};
         Clip(custom_edge);
 
         if (output_list->size() < 3)
-- 
cgit v1.2.3


From da88f3b8f0f9f1162b7ad41f70e2126195eee999 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Mon, 21 Aug 2017 12:18:52 -0500
Subject: Warnings: Fixed a few missing-return warnings in video_core.

---
 src/video_core/regs_framebuffer.h           | 10 ++++------
 src/video_core/swrasterizer/framebuffer.cpp |  2 ++
 src/video_core/swrasterizer/texturing.cpp   |  4 ++++
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/regs_framebuffer.h b/src/video_core/regs_framebuffer.h
index a50bd4111..7b565f911 100644
--- a/src/video_core/regs_framebuffer.h
+++ b/src/video_core/regs_framebuffer.h
@@ -256,10 +256,9 @@ struct FramebufferRegs {
             return 3;
         case DepthFormat::D24S8:
             return 4;
-        default:
-            LOG_CRITICAL(HW_GPU, "Unknown depth format %u", format);
-            UNIMPLEMENTED();
         }
+
+        ASSERT_MSG(false, "Unknown depth format %u", format);
     }
 
     // Returns the number of bits per depth component of the specified depth format
@@ -270,10 +269,9 @@ struct FramebufferRegs {
         case DepthFormat::D24:
         case DepthFormat::D24S8:
             return 24;
-        default:
-            LOG_CRITICAL(HW_GPU, "Unknown depth format %u", format);
-            UNIMPLEMENTED();
         }
+
+        ASSERT_MSG(false, "Unknown depth format %u", format);
     }
 
     INSERT_PADDING_WORDS(0x20);
diff --git a/src/video_core/swrasterizer/framebuffer.cpp b/src/video_core/swrasterizer/framebuffer.cpp
index 7de3aac75..f34eab6cf 100644
--- a/src/video_core/swrasterizer/framebuffer.cpp
+++ b/src/video_core/swrasterizer/framebuffer.cpp
@@ -352,6 +352,8 @@ u8 LogicOp(u8 src, u8 dest, FramebufferRegs::LogicOp op) {
     case FramebufferRegs::LogicOp::OrInverted:
         return ~src | dest;
     }
+
+    UNREACHABLE();
 };
 
 } // namespace Rasterizer
diff --git a/src/video_core/swrasterizer/texturing.cpp b/src/video_core/swrasterizer/texturing.cpp
index 4f02b93f2..79b1ce841 100644
--- a/src/video_core/swrasterizer/texturing.cpp
+++ b/src/video_core/swrasterizer/texturing.cpp
@@ -89,6 +89,8 @@ Math::Vec3<u8> GetColorModifier(TevStageConfig::ColorModifier factor,
     case ColorModifier::OneMinusSourceBlue:
         return (Math::Vec3<u8>(255, 255, 255) - values.bbb()).Cast<u8>();
     }
+
+    UNREACHABLE();
 };
 
 u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Math::Vec4<u8>& values) {
@@ -119,6 +121,8 @@ u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Math::Vec4<u8>&
     case AlphaModifier::OneMinusSourceBlue:
         return 255 - values.b();
     }
+
+    UNREACHABLE();
 };
 
 Math::Vec3<u8> ColorCombine(TevStageConfig::Operation op, const Math::Vec3<u8> input[3]) {
-- 
cgit v1.2.3


From e2c41a589198ff3162da8047a4c33162b02b0f2b Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Thu, 31 Aug 2017 12:24:00 +0300
Subject: video_core: report telemetry for gas mode

---
 src/video_core/renderer_opengl/gl_shader_gen.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 3f390491a..c8fc7a0ff 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -8,6 +8,7 @@
 #include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "video_core/regs_framebuffer.h"
 #include "video_core/regs_lighting.h"
 #include "video_core/regs_rasterizer.h"
@@ -1155,6 +1156,11 @@ vec4 secondary_fragment_color = vec4(0.0);
 
         // Blend the fog
         out += "last_tex_env_out.rgb = mix(fog_color.rgb, last_tex_env_out.rgb, fog_factor);\n";
+    } else if (state.fog_mode == TexturingRegs::FogMode::Gas) {
+        Core::Telemetry().AddField(Telemetry::FieldType::Session, "VideoCore_Pica_UseGasMode",
+                                   true);
+        LOG_CRITICAL(Render_OpenGL, "Unimplemented gas mode");
+        UNIMPLEMENTED();
     }
 
     out += "gl_FragDepth = depth;\n";
-- 
cgit v1.2.3


From 12fbc8c8dff3265b03cffdd5bb5e6dd6537cd824 Mon Sep 17 00:00:00 2001
From: wwylele <wwylele@gmail.com>
Date: Sun, 27 Aug 2017 07:33:27 +0300
Subject: pica/lighting: only apply Fresnel factor for the last light

---
 src/video_core/renderer_opengl/gl_shader_gen.cpp | 9 +++++----
 src/video_core/swrasterizer/lighting.cpp         | 7 ++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 3f390491a..b5f359da6 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -750,7 +750,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         }
 
         // Fresnel
-        if (lighting.lut_fr.enable &&
+        // Note: only the last entry in the light slots applies the Fresnel factor
+        if (light_index == lighting.src_num - 1 && lighting.lut_fr.enable &&
             LightingRegs::IsLightingSamplerSupported(lighting.config,
                                                      LightingRegs::LightingSampler::Fresnel)) {
             // Lookup fresnel LUT value
@@ -759,17 +760,17 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
                             lighting.lut_fr.type, lighting.lut_fr.abs_input);
             value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + value + ")";
 
-            // Enabled for difffuse lighting alpha component
+            // Enabled for diffuse lighting alpha component
             if (lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
                 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                out += "diffuse_sum.a  *= " + value + ";\n";
+                out += "diffuse_sum.a = " + value + ";\n";
             }
 
             // Enabled for the specular lighting alpha component
             if (lighting.fresnel_selector ==
                     LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
                 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                out += "specular_sum.a *= " + value + ";\n";
+                out += "specular_sum.a = " + value + ";\n";
             }
         }
 
diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
index b38964530..5fa748611 100644
--- a/src/video_core/swrasterizer/lighting.cpp
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -230,7 +230,8 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
 
         // Fresnel
-        if (lighting.config1.disable_lut_fr == 0 &&
+        // Note: only the last entry in the light slots applies the Fresnel factor
+        if (light_index == lighting.max_light_index && lighting.config1.disable_lut_fr == 0 &&
             LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
                                                      LightingRegs::LightingSampler::Fresnel)) {
 
@@ -242,14 +243,14 @@ std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
             if (lighting.config0.fresnel_selector ==
                     LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
                 lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                diffuse_sum.a() *= lut_value;
+                diffuse_sum.a() = lut_value;
             }
 
             // Enabled for the specular lighting alpha component
             if (lighting.config0.fresnel_selector ==
                     LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
                 lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                specular_sum.a() *= lut_value;
+                specular_sum.a() = lut_value;
             }
         }
 
-- 
cgit v1.2.3


From ad0b57f4071fb7ec9da764b3905e0bb5e4c5eef2 Mon Sep 17 00:00:00 2001
From: James Rowe <jroweboy@gmail.com>
Date: Thu, 7 Sep 2017 22:05:42 -0600
Subject: GPU: Add draw for immediate and batch modes

PR #1461 introduced a regression where some games would change configuration
even while in the poorly named "drawing" mode, which broke the heuristic
citra was using to determine when to draw the batch. This change adds
back in a draw call for batching, and also adds in a draw call in
immediate mode each time it adds a triangle.
---
 src/video_core/command_processor.cpp | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index fb65a3a0a..fff159058 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -243,6 +243,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
                     g_state.geometry_pipeline.Setup(shader_engine);
                     g_state.geometry_pipeline.SubmitVertex(output);
+
+                    // TODO: If drawing after every immediate mode triangle kills performance,
+                    // change it to flush triangles whenever a draing config register changes
+                    // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550
+                    VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+                    if (g_debug_context) {
+                        g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch,
+                                                 nullptr);
+                    }
                 }
             }
         }
@@ -398,6 +407,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                                                       range.second, range.first);
         }
 
+        MICROPROFILE_SCOPE(GPU_Drawing);
+        VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+        if (g_debug_context) {
+            g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+        }
+
         break;
     }
 
@@ -632,6 +647,6 @@ void ProcessCommandList(const u32* list, u32 size) {
     }
 }
 
-} // namespace
+} // namespace CommandProcessor
 
-} // namespace
+} // namespace Pica
-- 
cgit v1.2.3


From 6a110ac5f55502aa1330cc4dd09d11a4eb502e1b Mon Sep 17 00:00:00 2001
From: Huw Pascoe <huw.pascoe@gmail.com>
Date: Sat, 16 Sep 2017 04:30:35 +0100
Subject: Fixed framebuffer warning

---
 src/video_core/renderer_opengl/gl_rasterizer.cpp | 25 +++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 7b0cd1b66..7e09e4712 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -237,13 +237,24 @@ void RasterizerOpenGL::DrawTriangles() {
 
     glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
                            color_surface != nullptr ? color_surface->texture.handle : 0, 0);
-    glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
-                           depth_surface != nullptr ? depth_surface->texture.handle : 0, 0);
-    bool has_stencil =
-        regs.framebuffer.framebuffer.depth_format == Pica::FramebufferRegs::DepthFormat::D24S8;
-    glFramebufferTexture2D(
-        GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-        (has_stencil && depth_surface != nullptr) ? depth_surface->texture.handle : 0, 0);
+    if (depth_surface != nullptr) {
+        if (regs.framebuffer.framebuffer.depth_format ==
+            Pica::FramebufferRegs::DepthFormat::D24S8) {
+            // attach both depth and stencil
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
+                                   depth_surface->texture.handle, 0);
+        } else {
+            // attach depth
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
+                                   depth_surface->texture.handle, 0);
+            // clear stencil attachment
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
+        }
+    } else {
+        // clear both depth and stencil attachment
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+                               0);
+    }
 
     // Sync the viewport
     // These registers hold half-width and half-height, so must be multiplied by 2
-- 
cgit v1.2.3


From a234e4c2009b08039d0698cbbcc8595a1f04a615 Mon Sep 17 00:00:00 2001
From: Huw Pascoe <huw.pascoe@gmail.com>
Date: Sun, 17 Sep 2017 15:42:45 +0100
Subject: Improved performance of FromAttributeBuffer

The ternary operator is optimized by the compiler,
whereas std::fmin() is meant to return a value.

I've noticed a 5%-10% emulation speed increase.
---
 src/video_core/shader/shader.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index e9063e616..2857d2829 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -52,7 +52,8 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
     // The hardware takes the absolute and saturates vertex colors like this, *before* doing
     // interpolation
     for (unsigned i = 0; i < 4; ++i) {
-        ret.color[i] = float24::FromFloat32(std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
+        float c = std::fabs(ret.color[i].ToFloat32());
+        ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f);
     }
 
     LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), "
-- 
cgit v1.2.3


From 19d41dcc6e6892125f1123b34db3dc284f04b744 Mon Sep 17 00:00:00 2001
From: James Rowe <jroweboy@gmail.com>
Date: Sat, 23 Sep 2017 09:28:20 -0600
Subject: Remove pipeline.gpu_mode and fix minor issues

---
 src/video_core/command_processor.cpp | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index fff159058..3ab4af374 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -245,7 +245,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     g_state.geometry_pipeline.SubmitVertex(output);
 
                     // TODO: If drawing after every immediate mode triangle kills performance,
-                    // change it to flush triangles whenever a draing config register changes
+                    // change it to flush triangles whenever a drawing config register changes
                     // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550
                     VideoCore::g_renderer->Rasterizer()->DrawTriangles();
                     if (g_debug_context) {
@@ -259,16 +259,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     }
 
     case PICA_REG_INDEX(pipeline.gpu_mode):
-        if (regs.pipeline.gpu_mode == PipelineRegs::GPUMode::Configuring) {
-            MICROPROFILE_SCOPE(GPU_Drawing);
-
-            // Draw immediate mode triangles when GPU Mode is set to GPUMode::Configuring
-            VideoCore::g_renderer->Rasterizer()->DrawTriangles();
-
-            if (g_debug_context) {
-                g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
-            }
-        }
+        // This register likely just enables vertex processing and doesn't need any special handling
         break;
 
     case PICA_REG_INDEX_WORKAROUND(pipeline.command_buffer.trigger[0], 0x23c):
@@ -407,7 +398,6 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                                                       range.second, range.first);
         }
 
-        MICROPROFILE_SCOPE(GPU_Drawing);
         VideoCore::g_renderer->Rasterizer()->DrawTriangles();
         if (g_debug_context) {
             g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
-- 
cgit v1.2.3


From 876aa82c29d2e17f8b5a4f74155971cba78c00b6 Mon Sep 17 00:00:00 2001
From: Huw Pascoe <huw.pascoe@gmail.com>
Date: Sun, 24 Sep 2017 22:24:45 +0100
Subject: Optimized Morton

---
 src/video_core/utils.h | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 7ce83a055..d8567f314 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -8,17 +8,11 @@
 
 namespace VideoCore {
 
-/**
- * Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
- * arranged in a Z-order curve. More details on the bit manipulation at:
- * https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
- */
+// 8x8 Z-Order coordinate from 2D coordinates
 static inline u32 MortonInterleave(u32 x, u32 y) {
-    u32 i = (x & 7) | ((y & 7) << 8); // ---- -210
-    i = (i ^ (i << 2)) & 0x1313;      // ---2 --10
-    i = (i ^ (i << 1)) & 0x1515;      // ---2 -1-0
-    i = (i | (i >> 7)) & 0x3F;
-    return i;
+    static const u32 xlut[] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15};
+    static const u32 ylut[] = {0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a};
+    return xlut[x % 8] + ylut[y % 8];
 }
 
 /**
-- 
cgit v1.2.3


From 903906da3b9b274836510adcabf8adf8e2c15954 Mon Sep 17 00:00:00 2001
From: Huw Pascoe <huw.pascoe@gmail.com>
Date: Fri, 22 Sep 2017 15:37:42 +0100
Subject: Optimized Float<M,E> multiplication

Before:

ucomiss xmm1, xmm1
jp      .L9
pxor    xmm2, xmm2
mov     edx, 1
ucomiss xmm0, xmm2
setp    al
cmovne  eax, edx
test    al, al
jne     .L9
.L3:
movaps  xmm0, xmm2
ret
.L9:
ucomiss xmm0, xmm0
jp      .L10
pxor    xmm2, xmm2
mov     edx, 1
ucomiss xmm1, xmm2
setp    al
cmovne  eax, edx
test    al, al
je      .L3

After:

movaps  xmm2, xmm1
mulss   xmm2, xmm0
ucomiss xmm2, xmm2
jnp     .L3
ucomiss xmm1, xmm0
jnp     .L11
.L3:
movaps  xmm0, xmm2
ret
.L11:
pxor    xmm2, xmm2
jmp     .L3
---
 src/video_core/pica_types.h | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h
index 5d7e10066..2eafa7e9e 100644
--- a/src/video_core/pica_types.h
+++ b/src/video_core/pica_types.h
@@ -58,11 +58,12 @@ public:
     }
 
     Float<M, E> operator*(const Float<M, E>& flt) const {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            return Zero();
-        return Float<M, E>::FromFloat32(ToFloat32() * flt.ToFloat32());
+        float result = value * flt.ToFloat32();
+        // PICA gives 0 instead of NaN when multiplying 0 by inf
+        if (!std::isnan(value) && !std::isnan(flt.ToFloat32()))
+            if (std::isnan(result))
+                result = 0.f;
+        return Float<M, E>::FromFloat32(result);
     }
 
     Float<M, E> operator/(const Float<M, E>& flt) const {
@@ -78,12 +79,7 @@ public:
     }
 
     Float<M, E>& operator*=(const Float<M, E>& flt) {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            *this = Zero();
-        else
-            value *= flt.ToFloat32();
+        value = operator*(flt).value;
         return *this;
     }
 
-- 
cgit v1.2.3


From a321bce37834c1f3034bd87df14fc71c13e6b84a Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Tue, 29 Aug 2017 12:59:54 -0500
Subject: Disable unary operator- on Math::Vec2/Vec3/Vec4 for unsigned types.

It is unlikely we will ever use this without first doing a Cast to a signed type.
Fixes 9 "unary minus operator applied to unsigned type, result still unsigned" warnings on MSVC2017.3
---
 src/video_core/swrasterizer/clipper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/video_core')

diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index a52129eb7..c1ed48398 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -98,7 +98,7 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
 
     auto FlipQuaternionIfOpposite = [](auto& a, const auto& b) {
         if (Math::Dot(a, b) < float24::Zero())
-            a = -a;
+            a = a * float24::FromFloat32(-1.0f);
     };
 
     // Flip the quaternions if they are opposite to prevent interpolating them over the wrong
-- 
cgit v1.2.3


From a13ab958cbba75bc9abd1ca50f3030a10a75784e Mon Sep 17 00:00:00 2001
From: Huw Pascoe <huw.pascoe@gmail.com>
Date: Wed, 27 Sep 2017 00:26:09 +0100
Subject: Fixed type conversion ambiguity

---
 src/video_core/geometry_pipeline.cpp        | 2 +-
 src/video_core/renderer_opengl/gl_state.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/geometry_pipeline.cpp b/src/video_core/geometry_pipeline.cpp
index b146e2ecb..98ff2ccd3 100644
--- a/src/video_core/geometry_pipeline.cpp
+++ b/src/video_core/geometry_pipeline.cpp
@@ -105,7 +105,7 @@ public:
         DEBUG_ASSERT(need_index);
 
         // The number of vertex input is put to the uniform register
-        float24 vertex_num = float24::FromFloat32(val);
+        float24 vertex_num = float24::FromFloat32(static_cast<float>(val));
         setup.uniforms.f[0] = Math::MakeVec(vertex_num, vertex_num, vertex_num, vertex_num);
 
         // The second uniform register and so on are used for receiving input vertices
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 06a905766..5770ae08f 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -267,9 +267,9 @@ void OpenGLState::Apply() const {
     for (size_t i = 0; i < clip_distance.size(); ++i) {
         if (clip_distance[i] != cur_state.clip_distance[i]) {
             if (clip_distance[i]) {
-                glEnable(GL_CLIP_DISTANCE0 + i);
+                glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
             } else {
-                glDisable(GL_CLIP_DISTANCE0 + i);
+                glDisable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
             }
         }
     }
-- 
cgit v1.2.3


From b3b34a1e76664c412fd7b37b3529cadd3983acfb Mon Sep 17 00:00:00 2001
From: Huw Pascoe <huw.pascoe@gmail.com>
Date: Tue, 3 Oct 2017 12:21:37 +0100
Subject: Extracted the attribute setup and draw commands into their own
 functions

---
 src/video_core/command_processor.cpp | 439 ++++++++++++++++++-----------------
 1 file changed, 222 insertions(+), 217 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 3ab4af374..caf9f7a06 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -119,6 +119,224 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup,
     }
 }
 
+static void LoadDefaultVertexAttributes(u32 register_value) {
+    auto& regs = g_state.regs;
+
+    // Buffer the incoming word until a full packed attribute (three words) has been written.
+    // TODO: Does actual hardware indeed keep an intermediate buffer or directly write the values?
+    default_attr_write_buffer[default_attr_counter++] = register_value;
+
+    // Default attributes are written in a packed format such that four float24 values are encoded
+    // in three 32-bit numbers.
+    // We write to internal memory once a full such vector is written.
+    if (default_attr_counter >= 3) {
+        default_attr_counter = 0;
+
+        auto& setup = regs.pipeline.vs_default_attributes_setup;
+
+        if (setup.index >= 16) {
+            LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
+            return; // the three buffered words are dropped; the counter was already reset above
+        }
+
+        Math::Vec4<float24> attribute;
+
+        // NOTE: The destination component order indeed is "backwards" (w is unpacked first, x last)
+        attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8);
+        attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) |
+                                       ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
+        attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) |
+                                       ((default_attr_write_buffer[2] >> 24) & 0xFF));
+        attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF);
+
+        LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
+                  attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
+                  attribute.w.ToFloat32());
+
+        // TODO: Verify that this actually modifies the register!
+        if (setup.index < 15) { // indices 0-14: store as a default attribute, then advance
+            g_state.input_default_attributes.attr[setup.index] = attribute;
+            setup.index++;
+        } else {
+            // Index 15: put each attribute into an immediate input buffer. When all specified
+            // immediate attributes are present, the Vertex Shader is invoked and its output is
+            // submitted to the geometry pipeline, followed by a DrawTriangles() call.
+
+            auto& immediate_input = g_state.immediate.input_vertex;
+            auto& immediate_attribute_id = g_state.immediate.current_attribute;
+
+            immediate_input.attr[immediate_attribute_id] = attribute;
+
+            if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) {
+                immediate_attribute_id += 1;
+            } else {
+                MICROPROFILE_SCOPE(GPU_Drawing);
+                immediate_attribute_id = 0;
+
+                auto* shader_engine = Shader::GetEngine();
+                shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
+
+                // Send to vertex shader
+                if (g_debug_context)
+                    g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
+                                             static_cast<void*>(&immediate_input));
+                Shader::UnitState shader_unit;
+                Shader::AttributeBuffer output{};
+
+                shader_unit.LoadInput(regs.vs, immediate_input);
+                shader_engine->Run(g_state.vs, shader_unit);
+                shader_unit.WriteOutput(regs.vs, output);
+
+                // Send to geometry pipeline (reconfigured first if a reset was requested)
+                if (g_state.immediate.reset_geometry_pipeline) {
+                    g_state.geometry_pipeline.Reconfigure();
+                    g_state.immediate.reset_geometry_pipeline = false;
+                }
+                ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
+                g_state.geometry_pipeline.Setup(shader_engine);
+                g_state.geometry_pipeline.SubmitVertex(output);
+
+                // TODO: If drawing after every immediate mode triangle kills performance,
+                // change it to flush triangles whenever a drawing config register changes
+                // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550
+                VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+                if (g_debug_context) {
+                    g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+                }
+            }
+        }
+    }
+}
+
+static void Draw(u32 command_id) { // command_id: id of the draw-trigger register written
+    MICROPROFILE_SCOPE(GPU_Drawing);
+    auto& regs = g_state.regs;
+
+#if PICA_LOG_TEV
+    DebugUtils::DumpTevStageConfig(regs.GetTevStages());
+#endif
+    if (g_debug_context)
+        g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
+
+    // Processes information about internal vertex attributes to figure out how a vertex is
+    // loaded.
+    // Later, these can be compiled and cached.
+    const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
+    VertexLoader loader(regs.pipeline);
+
+    // Load vertices; the draw is indexed only when triggered through trigger_draw_indexed
+    bool is_indexed = (command_id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
+
+    const auto& index_info = regs.pipeline.index_array;
+    const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
+    const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
+    bool index_u16 = index_info.format != 0; // format 0 reads u8 indices, anything else u16
+
+    PrimitiveAssembler<Shader::OutputVertex>& primitive_assembler = g_state.primitive_assembler;
+
+    if (g_debug_context && g_debug_context->recorder) {
+        for (int i = 0; i < 3; ++i) {
+            const auto texture = regs.texturing.GetTextures()[i];
+            if (!texture.enabled)
+                continue;
+
+            u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
+            g_debug_context->recorder->MemoryAccessed(
+                texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) *
+                                  texture.config.width / 2 * texture.config.height,
+                texture.config.GetPhysicalAddress());
+        }
+    }
+
+    DebugUtils::MemoryAccessTracker memory_accesses;
+
+    // Simple circular-replacement vertex cache
+    // The size has been tuned for optimal balance between hit-rate and the cost of lookup
+    const size_t VERTEX_CACHE_SIZE = 32;
+    std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
+    std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
+    Shader::AttributeBuffer vs_output;
+
+    unsigned int vertex_cache_pos = 0;
+    vertex_cache_ids.fill(-1); // presumably wraps to the largest u16 value as the "empty" marker
+
+    auto* shader_engine = Shader::GetEngine();
+    Shader::UnitState shader_unit;
+
+    shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
+
+    g_state.geometry_pipeline.Reconfigure();
+    g_state.geometry_pipeline.Setup(shader_engine);
+    if (g_state.geometry_pipeline.NeedIndexInput())
+        ASSERT(is_indexed);
+
+    for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
+        // Indexed rendering doesn't use the start offset
+        unsigned int vertex = is_indexed
+                                  ? (index_u16 ? index_address_16[index] : index_address_8[index])
+                                  : (index + regs.pipeline.vertex_offset);
+
+        // -1 is a common special value used for primitive restart. Since it's unknown if
+        // the PICA supports it, and it would mess up the caching, guard against it here.
+        ASSERT(vertex != -1); // NOTE(review): vertex is unsigned; a 0xFFFF u16 index passes this
+
+        bool vertex_cache_hit = false;
+
+        if (is_indexed) {
+            if (g_state.geometry_pipeline.NeedIndexInput()) {
+                g_state.geometry_pipeline.SubmitIndex(vertex);
+                continue;
+            }
+
+            if (g_debug_context && Pica::g_debug_context->recorder) {
+                int size = index_u16 ? 2 : 1;
+                memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
+            }
+
+            for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
+                if (vertex == vertex_cache_ids[i]) {
+                    vs_output = vertex_cache[i];
+                    vertex_cache_hit = true;
+                    break;
+                }
+            }
+        }
+
+        if (!vertex_cache_hit) {
+            // Initialize data for the current vertex
+            Shader::AttributeBuffer input;
+            loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
+
+            // Send to vertex shader
+            if (g_debug_context)
+                g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
+                                         (void*)&input);
+            shader_unit.LoadInput(regs.vs, input);
+            shader_engine->Run(g_state.vs, shader_unit);
+            shader_unit.WriteOutput(regs.vs, vs_output);
+
+            if (is_indexed) { // only indexed draws populate the cache, overwriting the oldest slot
+                vertex_cache[vertex_cache_pos] = vs_output;
+                vertex_cache_ids[vertex_cache_pos] = vertex;
+                vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
+            }
+        }
+
+        // Send to geometry pipeline
+        g_state.geometry_pipeline.SubmitVertex(vs_output);
+    }
+
+    for (auto& range : memory_accesses.ranges) { // NOTE(review): recorder is not null-checked here
+        g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
+                                                  range.second, range.first);
+    }
+
+    VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+    if (g_debug_context) {
+        g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+    }
+}
+
 static void WritePicaReg(u32 id, u32 value, u32 mask) {
     auto& regs = g_state.regs;
 
@@ -168,95 +386,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     // Load default vertex input attributes
     case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[0], 0x233):
     case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[1], 0x234):
-    case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235): {
-        // TODO: Does actual hardware indeed keep an intermediate buffer or does
-        //       it directly write the values?
-        default_attr_write_buffer[default_attr_counter++] = value;
-
-        // Default attributes are written in a packed format such that four float24 values are
-        // encoded in
-        // three 32-bit numbers. We write to internal memory once a full such vector is
-        // written.
-        if (default_attr_counter >= 3) {
-            default_attr_counter = 0;
-
-            auto& setup = regs.pipeline.vs_default_attributes_setup;
-
-            if (setup.index >= 16) {
-                LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
-                break;
-            }
-
-            Math::Vec4<float24> attribute;
-
-            // NOTE: The destination component order indeed is "backwards"
-            attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8);
-            attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) |
-                                           ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
-            attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) |
-                                           ((default_attr_write_buffer[2] >> 24) & 0xFF));
-            attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF);
-
-            LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
-                      attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
-                      attribute.w.ToFloat32());
-
-            // TODO: Verify that this actually modifies the register!
-            if (setup.index < 15) {
-                g_state.input_default_attributes.attr[setup.index] = attribute;
-                setup.index++;
-            } else {
-                // Put each attribute into an immediate input buffer.  When all specified immediate
-                // attributes are present, the Vertex Shader is invoked and everything is sent to
-                // the primitive assembler.
-
-                auto& immediate_input = g_state.immediate.input_vertex;
-                auto& immediate_attribute_id = g_state.immediate.current_attribute;
-
-                immediate_input.attr[immediate_attribute_id] = attribute;
-
-                if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) {
-                    immediate_attribute_id += 1;
-                } else {
-                    MICROPROFILE_SCOPE(GPU_Drawing);
-                    immediate_attribute_id = 0;
-
-                    auto* shader_engine = Shader::GetEngine();
-                    shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
-
-                    // Send to vertex shader
-                    if (g_debug_context)
-                        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
-                                                 static_cast<void*>(&immediate_input));
-                    Shader::UnitState shader_unit;
-                    Shader::AttributeBuffer output{};
-
-                    shader_unit.LoadInput(regs.vs, immediate_input);
-                    shader_engine->Run(g_state.vs, shader_unit);
-                    shader_unit.WriteOutput(regs.vs, output);
-
-                    // Send to geometry pipeline
-                    if (g_state.immediate.reset_geometry_pipeline) {
-                        g_state.geometry_pipeline.Reconfigure();
-                        g_state.immediate.reset_geometry_pipeline = false;
-                    }
-                    ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
-                    g_state.geometry_pipeline.Setup(shader_engine);
-                    g_state.geometry_pipeline.SubmitVertex(output);
-
-                    // TODO: If drawing after every immediate mode triangle kills performance,
-                    // change it to flush triangles whenever a drawing config register changes
-                    // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550
-                    VideoCore::g_renderer->Rasterizer()->DrawTriangles();
-                    if (g_debug_context) {
-                        g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch,
-                                                 nullptr);
-                    }
-                }
-            }
-        }
+    case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235):
+        LoadDefaultVertexAttributes(value);
         break;
-    }
 
     case PICA_REG_INDEX(pipeline.gpu_mode):
         // This register likely just enables vertex processing and doesn't need any special handling
@@ -275,136 +407,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
     // It seems like these trigger vertex rendering
     case PICA_REG_INDEX(pipeline.trigger_draw):
-    case PICA_REG_INDEX(pipeline.trigger_draw_indexed): {
-        MICROPROFILE_SCOPE(GPU_Drawing);
-
-#if PICA_LOG_TEV
-        DebugUtils::DumpTevStageConfig(regs.GetTevStages());
-#endif
-        if (g_debug_context)
-            g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
-
-        // Processes information about internal vertex attributes to figure out how a vertex is
-        // loaded.
-        // Later, these can be compiled and cached.
-        const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
-        VertexLoader loader(regs.pipeline);
-
-        // Load vertices
-        bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
-
-        const auto& index_info = regs.pipeline.index_array;
-        const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
-        const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
-        bool index_u16 = index_info.format != 0;
-
-        PrimitiveAssembler<Shader::OutputVertex>& primitive_assembler = g_state.primitive_assembler;
-
-        if (g_debug_context && g_debug_context->recorder) {
-            for (int i = 0; i < 3; ++i) {
-                const auto texture = regs.texturing.GetTextures()[i];
-                if (!texture.enabled)
-                    continue;
-
-                u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
-                g_debug_context->recorder->MemoryAccessed(
-                    texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) *
-                                      texture.config.width / 2 * texture.config.height,
-                    texture.config.GetPhysicalAddress());
-            }
-        }
-
-        DebugUtils::MemoryAccessTracker memory_accesses;
-
-        // Simple circular-replacement vertex cache
-        // The size has been tuned for optimal balance between hit-rate and the cost of lookup
-        const size_t VERTEX_CACHE_SIZE = 32;
-        std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
-        std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
-        Shader::AttributeBuffer vs_output;
-
-        unsigned int vertex_cache_pos = 0;
-        vertex_cache_ids.fill(-1);
-
-        auto* shader_engine = Shader::GetEngine();
-        Shader::UnitState shader_unit;
-
-        shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
-
-        g_state.geometry_pipeline.Reconfigure();
-        g_state.geometry_pipeline.Setup(shader_engine);
-        if (g_state.geometry_pipeline.NeedIndexInput())
-            ASSERT(is_indexed);
-
-        for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
-            // Indexed rendering doesn't use the start offset
-            unsigned int vertex =
-                is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
-                           : (index + regs.pipeline.vertex_offset);
-
-            // -1 is a common special value used for primitive restart. Since it's unknown if
-            // the PICA supports it, and it would mess up the caching, guard against it here.
-            ASSERT(vertex != -1);
-
-            bool vertex_cache_hit = false;
-
-            if (is_indexed) {
-                if (g_state.geometry_pipeline.NeedIndexInput()) {
-                    g_state.geometry_pipeline.SubmitIndex(vertex);
-                    continue;
-                }
-
-                if (g_debug_context && Pica::g_debug_context->recorder) {
-                    int size = index_u16 ? 2 : 1;
-                    memory_accesses.AddAccess(base_address + index_info.offset + size * index,
-                                              size);
-                }
-
-                for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
-                    if (vertex == vertex_cache_ids[i]) {
-                        vs_output = vertex_cache[i];
-                        vertex_cache_hit = true;
-                        break;
-                    }
-                }
-            }
-
-            if (!vertex_cache_hit) {
-                // Initialize data for the current vertex
-                Shader::AttributeBuffer input;
-                loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
-
-                // Send to vertex shader
-                if (g_debug_context)
-                    g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
-                                             (void*)&input);
-                shader_unit.LoadInput(regs.vs, input);
-                shader_engine->Run(g_state.vs, shader_unit);
-                shader_unit.WriteOutput(regs.vs, vs_output);
-
-                if (is_indexed) {
-                    vertex_cache[vertex_cache_pos] = vs_output;
-                    vertex_cache_ids[vertex_cache_pos] = vertex;
-                    vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
-                }
-            }
-
-            // Send to geometry pipeline
-            g_state.geometry_pipeline.SubmitVertex(vs_output);
-        }
-
-        for (auto& range : memory_accesses.ranges) {
-            g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
-                                                      range.second, range.first);
-        }
-
-        VideoCore::g_renderer->Rasterizer()->DrawTriangles();
-        if (g_debug_context) {
-            g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
-        }
-
+    case PICA_REG_INDEX(pipeline.trigger_draw_indexed):
+        Draw(id);
         break;
-    }
 
     case PICA_REG_INDEX(gs.bool_uniforms):
         WriteUniformBoolReg(g_state.gs, g_state.regs.gs.bool_uniforms.Value());
-- 
cgit v1.2.3