16 files changed, 499 insertions, 120 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 0961a3251..cffa4c952 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SRCS
             shader/shader_interpreter.cpp
             swrasterizer/clipper.cpp
             swrasterizer/framebuffer.cpp
+            swrasterizer/lighting.cpp
             swrasterizer/proctex.cpp
             swrasterizer/rasterizer.cpp
             swrasterizer/swrasterizer.cpp
@@ -55,6 +56,7 @@ set(HEADERS
             shader/shader_interpreter.h
             swrasterizer/clipper.h
             swrasterizer/framebuffer.h
+            swrasterizer/lighting.h
             swrasterizer/proctex.h
             swrasterizer/rasterizer.h
             swrasterizer/swrasterizer.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 4633a1df1..f98ca3302 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -119,27 +119,6 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup,
     }
 }
 
-static void WriteProgramCode(ShaderRegs& config, Shader::ShaderSetup& setup,
-                             unsigned max_program_code_length, u32 value) {
-    if (config.program.offset >= max_program_code_length) {
-        LOG_ERROR(HW_GPU, "Invalid %s program offset %d", GetShaderSetupTypeName(setup),
-                  (int)config.program.offset);
-    } else {
-        setup.program_code[config.program.offset] = value;
-        config.program.offset++;
-    }
-}
-
-static void WriteSwizzlePatterns(ShaderRegs& config, Shader::ShaderSetup& setup, u32 value) {
-    if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) {
-        LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", GetShaderSetupTypeName(setup),
-                  (int)config.swizzle_patterns.offset);
-    } else {
-        setup.swizzle_data[config.swizzle_patterns.offset] = value;
-        config.swizzle_patterns.offset++;
-    }
-}
-
 static void WritePicaReg(u32 id, u32 value, u32 mask) {
     auto& regs = g_state.regs;
 
@@ -458,7 +437,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1):
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2):
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): {
-        WriteProgramCode(g_state.regs.gs, g_state.gs, 4096, value);
+        u32& offset = g_state.regs.gs.program.offset;
+        if (offset >= 4096) {
+            LOG_ERROR(HW_GPU, "Invalid GS program offset %u", offset);
+        } else {
+            g_state.gs.program_code[offset] = value;
+            offset++;
+        }
         break;
     }
 
@@ -470,11 +455,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab):
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac):
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): {
-        WriteSwizzlePatterns(g_state.regs.gs, g_state.gs, value);
+        u32& offset = g_state.regs.gs.swizzle_patterns.offset;
+        if (offset >= g_state.gs.swizzle_data.size()) {
+            LOG_ERROR(HW_GPU, "Invalid GS swizzle pattern offset %u", offset);
+        } else {
+            g_state.gs.swizzle_data[offset] = value;
+            offset++;
+        }
         break;
     }
 
     case PICA_REG_INDEX(vs.bool_uniforms):
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         WriteUniformBoolReg(g_state.vs, g_state.regs.vs.bool_uniforms.Value());
         break;
 
@@ -482,6 +474,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2):
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3):
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): {
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1));
         auto values = regs.vs.int_uniforms[index];
         WriteUniformIntReg(g_state.vs, index,
@@ -497,6 +490,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6):
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7):
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): {
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         WriteUniformFloatReg(g_state.regs.vs, g_state.vs, vs_float_regs_counter,
                              vs_uniform_write_buffer, value);
         break;
@@ -510,7 +504,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1):
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2):
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): {
-        WriteProgramCode(g_state.regs.vs, g_state.vs, 512, value);
+        u32& offset = g_state.regs.vs.program.offset;
+        if (offset >= 512) {
+            LOG_ERROR(HW_GPU, "Invalid VS program offset %u", offset);
+        } else {
+            g_state.vs.program_code[offset] = value;
+            if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) {
+                g_state.gs.program_code[offset] = value;
+            }
+            offset++;
+        }
         break;
     }
 
@@ -522,7 +525,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db):
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc):
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): {
-        WriteSwizzlePatterns(g_state.regs.vs, g_state.vs, value);
+        u32& offset = g_state.regs.vs.swizzle_patterns.offset;
+        if (offset >= g_state.vs.swizzle_data.size()) {
+            LOG_ERROR(HW_GPU, "Invalid VS swizzle pattern offset %u", offset);
+        } else {
+            g_state.vs.swizzle_data[offset] = value;
+            if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) {
+                g_state.gs.swizzle_data[offset] = value;
+            }
+            offset++;
+        }
         break;
     }
 
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index 2d23d34e6..864a2c9e6 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -79,7 +79,7 @@ struct State {
         std::array<ColorDifferenceEntry, 256> color_diff_table;
     } proctex;
 
-    struct {
+    struct Lighting {
         union LutEntry {
             // Used for raw access
             u32 raw;
diff --git a/src/video_core/regs_pipeline.h b/src/video_core/regs_pipeline.h
index 31c747d77..8b6369297 100644
--- a/src/video_core/regs_pipeline.h
+++ b/src/video_core/regs_pipeline.h
@@ -202,7 +202,14 @@ struct PipelineRegs {
     /// Number of input attributes to the vertex shader minus 1
     BitField<0, 4, u32> max_input_attrib_index;
 
-    INSERT_PADDING_WORDS(2);
+    INSERT_PADDING_WORDS(1);
+
+    // Shader unit 3, which can be used for both vertex and geometry shaders, gets its
+    // configuration depending on this register. If this is not set, unit 3 will share some
+    // configuration with other units. It is known that program code and swizzle patterns uploaded
+    // via regs.vs will also be uploaded to unit 3 if this is not set. Although very likely, it is
+    // still unclear whether uniforms and other configuration can also be shared.
+    BitField<0, 1, u32> gs_unit_exclusive_configuration;
 
     enum class GPUMode : u32 {
         Drawing = 0,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index ff3f69ba3..aa95ef21d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -28,6 +28,9 @@ MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
 RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
+    // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
+    state.clip_distance[0] = true;
+
     // Create sampler objects
     for (size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
@@ -117,48 +120,53 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
 
     // Setup the noise LUT for proctex
     proctex_noise_lut.Create();
-    state.proctex_noise_lut.texture_1d = proctex_noise_lut.handle;
+    state.proctex_noise_lut.texture_buffer = proctex_noise_lut.handle;
     state.Apply();
+    proctex_noise_lut_buffer.Create();
+    glBindBuffer(GL_TEXTURE_BUFFER, proctex_noise_lut_buffer.handle);
+    glBufferData(GL_TEXTURE_BUFFER, sizeof(GLfloat) * 2 * 128, nullptr, GL_DYNAMIC_DRAW);
     glActiveTexture(TextureUnits::ProcTexNoiseLUT.Enum());
-    glTexImage1D(GL_TEXTURE_1D, 0, GL_RG32F, 128, 0, GL_RG, GL_FLOAT, nullptr);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, proctex_noise_lut_buffer.handle);
 
     // Setup the color map for proctex
     proctex_color_map.Create();
-    state.proctex_color_map.texture_1d = proctex_color_map.handle;
+    state.proctex_color_map.texture_buffer = proctex_color_map.handle;
     state.Apply();
+    proctex_color_map_buffer.Create();
+    glBindBuffer(GL_TEXTURE_BUFFER, proctex_color_map_buffer.handle);
+    glBufferData(GL_TEXTURE_BUFFER, sizeof(GLfloat) * 2 * 128, nullptr, GL_DYNAMIC_DRAW);
     glActiveTexture(TextureUnits::ProcTexColorMap.Enum());
-    glTexImage1D(GL_TEXTURE_1D, 0, GL_RG32F, 128, 0, GL_RG, GL_FLOAT, nullptr);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, proctex_color_map_buffer.handle);
 
     // Setup the alpha map for proctex
     proctex_alpha_map.Create();
-    state.proctex_alpha_map.texture_1d = proctex_alpha_map.handle;
+    state.proctex_alpha_map.texture_buffer = proctex_alpha_map.handle;
     state.Apply();
+    proctex_alpha_map_buffer.Create();
+    glBindBuffer(GL_TEXTURE_BUFFER, proctex_alpha_map_buffer.handle);
+    glBufferData(GL_TEXTURE_BUFFER, sizeof(GLfloat) * 2 * 128, nullptr, GL_DYNAMIC_DRAW);
     glActiveTexture(TextureUnits::ProcTexAlphaMap.Enum());
-    glTexImage1D(GL_TEXTURE_1D, 0, GL_RG32F, 128, 0, GL_RG, GL_FLOAT, nullptr);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, proctex_alpha_map_buffer.handle);
 
     // Setup the LUT for proctex
     proctex_lut.Create();
-    state.proctex_lut.texture_1d = proctex_lut.handle;
+    state.proctex_lut.texture_buffer = proctex_lut.handle;
     state.Apply();
+    proctex_lut_buffer.Create();
+    glBindBuffer(GL_TEXTURE_BUFFER, proctex_lut_buffer.handle);
+    glBufferData(GL_TEXTURE_BUFFER, sizeof(GLfloat) * 4 * 256, nullptr, GL_DYNAMIC_DRAW);
     glActiveTexture(TextureUnits::ProcTexLUT.Enum());
-    glTexImage1D(GL_TEXTURE_1D, 0, GL_RGBA32F, 256, 0, GL_RGBA, GL_FLOAT, nullptr);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_lut_buffer.handle);
 
     // Setup the difference LUT for proctex
     proctex_diff_lut.Create();
-    state.proctex_diff_lut.texture_1d = proctex_diff_lut.handle;
+    state.proctex_diff_lut.texture_buffer = proctex_diff_lut.handle;
     state.Apply();
+    proctex_diff_lut_buffer.Create();
+    glBindBuffer(GL_TEXTURE_BUFFER, proctex_diff_lut_buffer.handle);
+    glBufferData(GL_TEXTURE_BUFFER, sizeof(GLfloat) * 4 * 256, nullptr, GL_DYNAMIC_DRAW);
     glActiveTexture(TextureUnits::ProcTexDiffLUT.Enum());
-    glTexImage1D(GL_TEXTURE_1D, 0, GL_RGBA32F, 256, 0, GL_RGBA, GL_FLOAT, nullptr);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_diff_lut_buffer.handle);
 
     // Sync fixed function OpenGL state
     SyncCullMode();
@@ -1387,7 +1395,7 @@ void RasterizerOpenGL::SyncProcTexNoise() {
 
 // helper function for SyncProcTexNoiseLUT/ColorMap/AlphaMap
 static void SyncProcTexValueLUT(const std::array<Pica::State::ProcTex::ValueEntry, 128>& lut,
-                                std::array<GLvec2, 128>& lut_data, GLenum texture) {
+                                std::array<GLvec2, 128>& lut_data, GLuint buffer) {
     std::array<GLvec2, 128> new_data;
     std::transform(lut.begin(), lut.end(), new_data.begin(), [](const auto& entry) {
         return GLvec2{entry.ToFloat(), entry.DiffToFloat()};
@@ -1395,24 +1403,24 @@ static void SyncProcTexValueLUT(const std::array<Pica::State::ProcTex::ValueEntr
 
     if (new_data != lut_data) {
         lut_data = new_data;
-        glActiveTexture(texture);
-        glTexSubImage1D(GL_TEXTURE_1D, 0, 0, 128, GL_RG, GL_FLOAT, lut_data.data());
+        glBindBuffer(GL_TEXTURE_BUFFER, buffer);
+        glBufferSubData(GL_TEXTURE_BUFFER, 0, new_data.size() * sizeof(GLvec2), new_data.data());
     }
 }
 
 void RasterizerOpenGL::SyncProcTexNoiseLUT() {
     SyncProcTexValueLUT(Pica::g_state.proctex.noise_table, proctex_noise_lut_data,
-                        TextureUnits::ProcTexNoiseLUT.Enum());
+                        proctex_noise_lut_buffer.handle);
 }
 
 void RasterizerOpenGL::SyncProcTexColorMap() {
     SyncProcTexValueLUT(Pica::g_state.proctex.color_map_table, proctex_color_map_data,
-                        TextureUnits::ProcTexColorMap.Enum());
+                        proctex_color_map_buffer.handle);
 }
 
 void RasterizerOpenGL::SyncProcTexAlphaMap() {
     SyncProcTexValueLUT(Pica::g_state.proctex.alpha_map_table, proctex_alpha_map_data,
-                        TextureUnits::ProcTexAlphaMap.Enum());
+                        proctex_alpha_map_buffer.handle);
 }
 
 void RasterizerOpenGL::SyncProcTexLUT() {
@@ -1427,8 +1435,8 @@ void RasterizerOpenGL::SyncProcTexLUT() {
 
     if (new_data != proctex_lut_data) {
         proctex_lut_data = new_data;
-        glActiveTexture(TextureUnits::ProcTexLUT.Enum());
-        glTexSubImage1D(GL_TEXTURE_1D, 0, 0, 256, GL_RGBA, GL_FLOAT, proctex_lut_data.data());
+        glBindBuffer(GL_TEXTURE_BUFFER, proctex_lut_buffer.handle);
+        glBufferSubData(GL_TEXTURE_BUFFER, 0, new_data.size() * sizeof(GLvec4), new_data.data());
     }
 }
 
@@ -1444,8 +1452,8 @@ void RasterizerOpenGL::SyncProcTexDiffLUT() {
 
     if (new_data != proctex_diff_lut_data) {
         proctex_diff_lut_data = new_data;
-        glActiveTexture(TextureUnits::ProcTexDiffLUT.Enum());
-        glTexSubImage1D(GL_TEXTURE_1D, 0, 0, 256, GL_RGBA, GL_FLOAT, proctex_diff_lut_data.data());
+        glBindBuffer(GL_TEXTURE_BUFFER, proctex_diff_lut_buffer.handle);
+        glBufferSubData(GL_TEXTURE_BUFFER, 0, new_data.size() * sizeof(GLvec4), new_data.data());
     }
 }
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index a433c1d4a..78e218efe 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -287,18 +287,23 @@ private:
     OGLTexture fog_lut;
     std::array<GLvec2, 128> fog_lut_data{};
 
+    OGLBuffer proctex_noise_lut_buffer;
     OGLTexture proctex_noise_lut;
     std::array<GLvec2, 128> proctex_noise_lut_data{};
 
+    OGLBuffer proctex_color_map_buffer;
     OGLTexture proctex_color_map;
     std::array<GLvec2, 128> proctex_color_map_data{};
 
+    OGLBuffer proctex_alpha_map_buffer;
     OGLTexture proctex_alpha_map;
     std::array<GLvec2, 128> proctex_alpha_map_data{};
 
+    OGLBuffer proctex_lut_buffer;
     OGLTexture proctex_lut;
     std::array<GLvec4, 256> proctex_lut_data{};
 
+    OGLBuffer proctex_diff_lut_buffer;
     OGLTexture proctex_diff_lut;
     std::array<GLvec4, 256> proctex_diff_lut_data{};
 };
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 8b717e43d..f37894e7a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -542,10 +542,11 @@ RasterizerCacheOpenGL::GetFramebufferSurfaces(
             config.GetDepthBufferPhysicalAddress(),
             fb_area * Pica::FramebufferRegs::BytesPerDepthPixel(config.depth_format));
     bool using_color_fb = config.GetColorBufferPhysicalAddress() != 0;
-    bool using_depth_fb =
-        config.GetDepthBufferPhysicalAddress() != 0 &&
-        (regs.framebuffer.output_merger.depth_test_enable ||
-         regs.framebuffer.output_merger.depth_write_enable || !framebuffers_overlap);
+    bool depth_write_enable = regs.framebuffer.output_merger.depth_write_enable &&
+                              regs.framebuffer.framebuffer.allow_depth_stencil_write;
+    bool using_depth_fb = config.GetDepthBufferPhysicalAddress() != 0 &&
+                          (regs.framebuffer.output_merger.depth_test_enable || depth_write_enable ||
+                           !framebuffers_overlap);
 
     if (framebuffers_overlap && using_color_fb && using_depth_fb) {
         LOG_CRITICAL(Render_OpenGL, "Color and depth framebuffer memory regions overlap; "
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index c93b108fb..015e69da9 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -525,11 +525,12 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
            "float geo_factor = 1.0;\n";
 
     // Compute fragment normals and tangents
-    const std::string pertubation =
-        "2.0 * (" + SampleTexture(config, lighting.bump_selector) + ").rgb - 1.0";
+    auto Perturbation = [&]() {
+        return "2.0 * (" + SampleTexture(config, lighting.bump_selector) + ").rgb - 1.0";
+    };
     if (lighting.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
         // Bump mapping is enabled using a normal map
-        out += "vec3 surface_normal = " + pertubation + ";\n";
+        out += "vec3 surface_normal = " + Perturbation() + ";\n";
 
         // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher
         // precision result
@@ -543,7 +544,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         out += "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n";
     } else if (lighting.bump_mode == LightingRegs::LightingBumpMode::TangentMap) {
         // Bump mapping is enabled using a tangent map
-        out += "vec3 surface_tangent = " + pertubation + ";\n";
+        out += "vec3 surface_tangent = " + Perturbation() + ";\n";
         // Mathematically, recomputing Z-component of the tangent vector won't affect the relevant
         // computation below, which is also confirmed on 3DS. So we don't bother recomputing here
         // even if 'renorm' is enabled.
@@ -886,12 +887,12 @@ void AppendProcTexSampler(std::string& out, const PicaShaderConfig& config) {
     // coord=1.0 is lut[127]+lut_diff[127]. For other indices, the result is interpolated using
     // value entries and difference entries.
     out += R"(
-float ProcTexLookupLUT(sampler1D lut, float coord) {
+float ProcTexLookupLUT(samplerBuffer lut, float coord) {
     coord *= 128;
     float index_i = clamp(floor(coord), 0.0, 127.0);
     float index_f = coord - index_i; // fract() cannot be used here because 128.0 needs to be
                                      // extracted as index_i = 127.0 and index_f = 1.0
-    vec2 entry = texelFetch(lut, int(index_i), 0).rg;
+    vec2 entry = texelFetch(lut, int(index_i)).rg;
     return clamp(entry.r + entry.g * index_f, 0.0, 1.0);
 }
     )";
@@ -979,14 +980,14 @@ float ProcTexNoiseCoef(vec2 x) {
         out += "int lut_index_i = int(lut_coord) + " +
                std::to_string(config.state.proctex.lut_offset) + ";\n";
         out += "float lut_index_f = fract(lut_coord);\n";
-        out += "vec4 final_color = texelFetch(proctex_lut, lut_index_i, 0) + lut_index_f * "
-               "texelFetch(proctex_diff_lut, lut_index_i, 0);\n";
+        out += "vec4 final_color = texelFetch(proctex_lut, lut_index_i) + lut_index_f * "
+               "texelFetch(proctex_diff_lut, lut_index_i);\n";
         break;
     case ProcTexFilter::Nearest:
     case ProcTexFilter::NearestMipmapLinear:
     case ProcTexFilter::NearestMipmapNearest:
         out += "lut_coord += " + std::to_string(config.state.proctex.lut_offset) + ";\n";
-        out += "vec4 final_color = texelFetch(proctex_lut, int(round(lut_coord)), 0);\n";
+        out += "vec4 final_color = texelFetch(proctex_lut, int(round(lut_coord)));\n";
         break;
     }
 
@@ -1053,11 +1054,11 @@ layout (std140) uniform shader_data {
 uniform sampler2D tex[3];
 uniform samplerBuffer lighting_lut;
 uniform samplerBuffer fog_lut;
-uniform sampler1D proctex_noise_lut;
-uniform sampler1D proctex_color_map;
-uniform sampler1D proctex_alpha_map;
-uniform sampler1D proctex_lut;
-uniform sampler1D proctex_diff_lut;
+uniform samplerBuffer proctex_noise_lut;
+uniform samplerBuffer proctex_color_map;
+uniform samplerBuffer proctex_alpha_map;
+uniform samplerBuffer proctex_lut;
+uniform samplerBuffer proctex_diff_lut;
 
 // Rotate the vector v by the quaternion q
 vec3 quaternion_rotate(vec4 q, vec3 v) {
@@ -1111,7 +1112,10 @@ vec4 secondary_fragment_color = vec4(0.0);
                "gl_FragCoord.y < scissor_y2)) discard;\n";
     }
 
-    out += "float z_over_w = 1.0 - gl_FragCoord.z * 2.0;\n";
+    // After the perspective divide, OpenGL transforms z_over_w from [-1, 1] to [near, far]. Here
+    // we use the default near = 0 and far = 1, undo that transformation to recover the original
+    // z_over_w, and then apply our own transformation according to the PICA specification.
+    out += "float z_over_w = 2.0 * gl_FragCoord.z - 1.0;\n";
     out += "float depth = z_over_w * depth_scale + depth_offset;\n";
     if (state.depthmap_enable == RasterizerRegs::DepthBuffering::WBuffering) {
         out += "depth /= gl_FragCoord.w;\n";
@@ -1194,7 +1198,9 @@ void main() {
     texcoord0_w = vert_texcoord0_w;
     normquat = vert_normquat;
     view = vert_view;
-    gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
+    gl_Position = vert_position;
+    gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
+    // TODO (wwylele): calculate gl_ClipDistance[1] from user-defined clipping plane
 }
 )";
 
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index eface2dea..06a905766 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -56,11 +56,11 @@ OpenGLState::OpenGLState() {
 
     fog_lut.texture_buffer = 0;
 
-    proctex_lut.texture_1d = 0;
-    proctex_diff_lut.texture_1d = 0;
-    proctex_color_map.texture_1d = 0;
-    proctex_alpha_map.texture_1d = 0;
-    proctex_noise_lut.texture_1d = 0;
+    proctex_lut.texture_buffer = 0;
+    proctex_diff_lut.texture_buffer = 0;
+    proctex_color_map.texture_buffer = 0;
+    proctex_alpha_map.texture_buffer = 0;
+    proctex_noise_lut.texture_buffer = 0;
 
     draw.read_framebuffer = 0;
     draw.draw_framebuffer = 0;
@@ -68,6 +68,8 @@ OpenGLState::OpenGLState() {
     draw.vertex_buffer = 0;
     draw.uniform_buffer = 0;
     draw.shader_program = 0;
+
+    clip_distance = {};
 }
 
 void OpenGLState::Apply() const {
@@ -204,33 +206,33 @@ void OpenGLState::Apply() const {
     }
 
     // ProcTex Noise LUT
-    if (proctex_noise_lut.texture_1d != cur_state.proctex_noise_lut.texture_1d) {
+    if (proctex_noise_lut.texture_buffer != cur_state.proctex_noise_lut.texture_buffer) {
         glActiveTexture(TextureUnits::ProcTexNoiseLUT.Enum());
-        glBindTexture(GL_TEXTURE_1D, proctex_noise_lut.texture_1d);
+        glBindTexture(GL_TEXTURE_BUFFER, proctex_noise_lut.texture_buffer);
     }
 
     // ProcTex Color Map
-    if (proctex_color_map.texture_1d != cur_state.proctex_color_map.texture_1d) {
+    if (proctex_color_map.texture_buffer != cur_state.proctex_color_map.texture_buffer) {
         glActiveTexture(TextureUnits::ProcTexColorMap.Enum());
-        glBindTexture(GL_TEXTURE_1D, proctex_color_map.texture_1d);
+        glBindTexture(GL_TEXTURE_BUFFER, proctex_color_map.texture_buffer);
     }
 
     // ProcTex Alpha Map
-    if (proctex_alpha_map.texture_1d != cur_state.proctex_alpha_map.texture_1d) {
+    if (proctex_alpha_map.texture_buffer != cur_state.proctex_alpha_map.texture_buffer) {
         glActiveTexture(TextureUnits::ProcTexAlphaMap.Enum());
-        glBindTexture(GL_TEXTURE_1D, proctex_alpha_map.texture_1d);
+        glBindTexture(GL_TEXTURE_BUFFER, proctex_alpha_map.texture_buffer);
     }
 
     // ProcTex LUT
-    if (proctex_lut.texture_1d != cur_state.proctex_lut.texture_1d) {
+    if (proctex_lut.texture_buffer != cur_state.proctex_lut.texture_buffer) {
         glActiveTexture(TextureUnits::ProcTexLUT.Enum());
-        glBindTexture(GL_TEXTURE_1D, proctex_lut.texture_1d);
+        glBindTexture(GL_TEXTURE_BUFFER, proctex_lut.texture_buffer);
     }
 
     // ProcTex Diff LUT
-    if (proctex_diff_lut.texture_1d != cur_state.proctex_diff_lut.texture_1d) {
+    if (proctex_diff_lut.texture_buffer != cur_state.proctex_diff_lut.texture_buffer) {
         glActiveTexture(TextureUnits::ProcTexDiffLUT.Enum());
-        glBindTexture(GL_TEXTURE_1D, proctex_diff_lut.texture_1d);
+        glBindTexture(GL_TEXTURE_BUFFER, proctex_diff_lut.texture_buffer);
     }
 
     // Framebuffer
@@ -261,6 +263,17 @@ void OpenGLState::Apply() const {
         glUseProgram(draw.shader_program);
     }
 
+    // Clip distance
+    for (size_t i = 0; i < clip_distance.size(); ++i) {
+        if (clip_distance[i] != cur_state.clip_distance[i]) {
+            if (clip_distance[i]) {
+                glEnable(GL_CLIP_DISTANCE0 + i);
+            } else {
+                glDisable(GL_CLIP_DISTANCE0 + i);
+            }
+        }
+    }
+
     cur_state = *this;
 }
 
@@ -274,16 +287,16 @@ void OpenGLState::ResetTexture(GLuint handle) {
         cur_state.lighting_lut.texture_buffer = 0;
     if (cur_state.fog_lut.texture_buffer == handle)
         cur_state.fog_lut.texture_buffer = 0;
-    if (cur_state.proctex_noise_lut.texture_1d == handle)
-        cur_state.proctex_noise_lut.texture_1d = 0;
-    if (cur_state.proctex_color_map.texture_1d == handle)
-        cur_state.proctex_color_map.texture_1d = 0;
-    if (cur_state.proctex_alpha_map.texture_1d == handle)
-        cur_state.proctex_alpha_map.texture_1d = 0;
-    if (cur_state.proctex_lut.texture_1d == handle)
-        cur_state.proctex_lut.texture_1d = 0;
-    if (cur_state.proctex_diff_lut.texture_1d == handle)
-        cur_state.proctex_diff_lut.texture_1d = 0;
+    if (cur_state.proctex_noise_lut.texture_buffer == handle)
+        cur_state.proctex_noise_lut.texture_buffer = 0;
+    if (cur_state.proctex_color_map.texture_buffer == handle)
+        cur_state.proctex_color_map.texture_buffer = 0;
+    if (cur_state.proctex_alpha_map.texture_buffer == handle)
+        cur_state.proctex_alpha_map.texture_buffer = 0;
+    if (cur_state.proctex_lut.texture_buffer == handle)
+        cur_state.proctex_lut.texture_buffer = 0;
+    if (cur_state.proctex_diff_lut.texture_buffer == handle)
+        cur_state.proctex_diff_lut.texture_buffer = 0;
 }
 
 void OpenGLState::ResetSampler(GLuint handle) {
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 1efcf0811..437fe34c4 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <array>
 #include <glad/glad.h>
 
 namespace TextureUnits {
@@ -95,23 +96,23 @@ public:
     } fog_lut;
 
     struct {
-        GLuint texture_1d; // GL_TEXTURE_BINDING_1D
+        GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
     } proctex_noise_lut;
 
     struct {
-        GLuint texture_1d; // GL_TEXTURE_BINDING_1D
+        GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
     } proctex_color_map;
 
     struct {
-        GLuint texture_1d; // GL_TEXTURE_BINDING_1D
+        GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
     } proctex_alpha_map;
 
     struct {
-        GLuint texture_1d; // GL_TEXTURE_BINDING_1D
+        GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
     } proctex_lut;
 
     struct {
-        GLuint texture_1d; // GL_TEXTURE_BINDING_1D
+        GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
     } proctex_diff_lut;
 
     struct {
@@ -123,6 +124,8 @@ public:
         GLuint shader_program;   // GL_CURRENT_PROGRAM
     } draw;
 
+    std::array<bool, 2> clip_distance; // GL_CLIP_DISTANCE
+
     OpenGLState();
 
     /// Get the currently active OpenGL state
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index aa1cec81f..206c0978a 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -631,7 +631,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                 state.address_registers[2] = loop_param.y;
 
                 Record<DebugDataRecord::LOOP_INT_IN>(debug_data, iteration, loop_param);
-                call(program_counter + 1, instr.flow_control.dest_offset - program_counter + 1,
+                call(program_counter + 1, instr.flow_control.dest_offset - program_counter,
                      instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z);
                 break;
             }
diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index 6fb923756..cdbc71502 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -95,6 +95,17 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     static const size_t MAX_VERTICES = 9;
     static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
     static_vector<Vertex, MAX_VERTICES> buffer_b;
+
+    auto FlipQuaternionIfOpposite = [](auto& a, const auto& b) {
+        if (Math::Dot(a, b) < float24::Zero())
+            a = -a;
+    };
+
+    // Flip the quaternions if they are opposite to prevent interpolating them over the wrong
+    // direction.
+    FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat);
+    FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat);
+
     auto* output_list = &buffer_a;
     auto* input_list = &buffer_b;
 
@@ -114,10 +125,6 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
         {Math::MakeVec(f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
     }};
 
-    // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
-    //       drop the whole primitive instead of clipping the primitive properly. We should test if
-    //       this happens on the 3DS, too.
-
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
     for (auto edge : clipping_edges) {
diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
new file mode 100644
index 000000000..39a3e396d
--- /dev/null
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -0,0 +1,278 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/math_util.h"
+#include "video_core/swrasterizer/lighting.h"
+
+namespace Pica {
+
+// Returns entry `index` of LUT `lut_index`, offset by `delta` times the entry's difference term.
+static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
+                               float delta) {
+    ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
+    const auto& table = lighting.luts[lut_index];
+    ASSERT_MSG(index < table.size(), "Out of range index");
+
+    const auto& entry = table[index];
+    const float base = entry.ToFloat();
+    const float slope = entry.DiffToFloat();
+    return base + slope * delta;
+}
+
+// Computes the fragment lighting colors by summing every light slot up to `max_light_index`.
+// `normquat` rotates the unperturbed surface normal (0, 0, 1); `view` is the view vector.
+// Returns {diffuse, specular}, each clamped to [0, 1] and scaled to 8 bits per channel.
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view) {
+
+    // TODO(Subv): Bump mapping
+    Math::Vec3<float> surface_normal = {0.0f, 0.0f, 1.0f};
+
+    if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
+        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping");
+        UNIMPLEMENTED();
+    }
+
+    // Rotate the surface normal by the quaternion (expected to be normalized by the caller)
+    auto normal = Math::QuaternionRotate(normquat, surface_normal);
+
+    // The view direction is identical for every light, so normalize it once outside the loop
+    const Math::Vec3<float> norm_view = view.Normalized();
+
+    Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+    Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+
+    for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
+        unsigned num = lighting.light_enable.GetNum(light_index);
+        const auto& light_config = lighting.light[num];
+
+        Math::Vec3<float> refl_value = {};
+        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
+                                      float16::FromRaw(light_config.y).ToFloat32(),
+                                      float16::FromRaw(light_config.z).ToFloat32()};
+        Math::Vec3<float> light_vector;
+
+        if (light_config.config.directional)
+            light_vector = position;
+        else
+            light_vector = position + view;
+
+        light_vector.Normalize();
+
+        Math::Vec3<float> half_vector = norm_view + light_vector;
+
+        float dist_atten = 1.0f;
+        if (!lighting.IsDistAttenDisabled(num)) {
+            auto distance = (-view - position).Length();
+            float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
+            float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
+            size_t lut =
+                static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
+
+            float sample_loc = MathUtil::Clamp(scale * distance + bias, 0.0f, 1.0f);
+
+            u8 lutindex =
+                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
+            float delta = sample_loc * 256 - lutindex;
+            dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
+        }
+
+        // Samples `sampler` with the dot product chosen by `input` and applies the LUT scale
+        auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
+                               LightingRegs::LightingScale scale_enum,
+                               LightingRegs::LightingSampler sampler) {
+            float result = 0.0f;
+
+            switch (input) {
+            case LightingRegs::LightingLutInput::NH:
+                result = Math::Dot(normal, half_vector.Normalized());
+                break;
+
+            case LightingRegs::LightingLutInput::VH:
+                result = Math::Dot(norm_view, half_vector.Normalized());
+                break;
+
+            case LightingRegs::LightingLutInput::NV:
+                result = Math::Dot(normal, norm_view);
+                break;
+
+            case LightingRegs::LightingLutInput::LN:
+                result = Math::Dot(light_vector, normal);
+                break;
+
+            case LightingRegs::LightingLutInput::SP: {
+                Math::Vec3<s32> spot_dir{light_config.spot_x.Value(), light_config.spot_y.Value(),
+                                         light_config.spot_z.Value()};
+                result = Math::Dot(light_vector, spot_dir.Cast<float>() / 2047.0f);
+                break;
+            }
+            default:
+                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
+                UNIMPLEMENTED();
+                result = 0.0f;
+            }
+
+            u8 index;
+            float delta;
+
+            if (abs) {
+                if (light_config.config.two_sided_diffuse)
+                    result = std::abs(result);
+                else
+                    result = std::max(result, 0.0f);
+
+                float flr = std::floor(result * 256.0f);
+                index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
+                delta = result * 256 - index;
+            } else {
+                float flr = std::floor(result * 128.0f);
+                s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
+                delta = result * 128.0f - signed_index;
+                index = static_cast<u8>(signed_index);
+            }
+
+            float scale = lighting.lut_scale.GetScale(scale_enum);
+            return scale *
+                   LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
+        };
+
+        // If enabled, compute spot light attenuation value
+        float spot_atten = 1.0f;
+        if (!lighting.IsSpotAttenDisabled(num) &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::SpotlightAttenuation)) {
+            auto lut = LightingRegs::SpotlightAttenuationSampler(num);
+            spot_atten = GetLutValue(lighting.lut_input.sp, lighting.abs_lut_input.disable_sp == 0,
+                                     lighting.lut_scale.sp, lut);
+        }
+
+        // Specular 0 component
+        float d0_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d0 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
+            d0_lut_value =
+                GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0,
+                            lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0);
+        }
+
+        Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
+
+        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
+        if (lighting.config1.disable_lut_rr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectRed)) {
+            refl_value.x =
+                GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0,
+                            lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed);
+        } else {
+            refl_value.x = 1.0f;
+        }
+
+        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rg == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectGreen)) {
+            refl_value.y =
+                GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0,
+                            lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen);
+        } else {
+            refl_value.y = refl_value.x;
+        }
+
+        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rb == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectBlue)) {
+            refl_value.z =
+                GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0,
+                            lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue);
+        } else {
+            refl_value.z = refl_value.x;
+        }
+
+        // Specular 1 component
+        float d1_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d1 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
+            d1_lut_value =
+                GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0,
+                            lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1);
+        }
+
+        Math::Vec3<float> specular_1 =
+            d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
+
+        // Fresnel
+        if (lighting.config1.disable_lut_fr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::Fresnel)) {
+
+            float lut_value =
+                GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0,
+                            lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel);
+
+            // Enabled for diffuse lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                diffuse_sum.a() *= lut_value;
+            }
+
+            // Enabled for the specular lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                specular_sum.a() *= lut_value;
+            }
+        }
+
+        auto dot_product = Math::Dot(light_vector, normal);
+
+        // Calculate clamp highlights before applying the two-sided diffuse configuration to the dot
+        // product.
+        float clamp_highlights = 1.0f;
+        if (lighting.config0.clamp_highlights && dot_product <= 0.0f)
+            clamp_highlights = 0.0f;
+
+        if (light_config.config.two_sided_diffuse)
+            dot_product = std::abs(dot_product);
+        else
+            dot_product = std::max(dot_product, 0.0f);
+
+        if (light_config.config.geometric_factor_0 || light_config.config.geometric_factor_1) {
+            float geo_factor = half_vector.Length2();
+            geo_factor = geo_factor == 0.0f ? 0.0f : std::min(dot_product / geo_factor, 1.0f);
+            if (light_config.config.geometric_factor_0) {
+                specular_0 *= geo_factor;
+            }
+            if (light_config.config.geometric_factor_1) {
+                specular_1 *= geo_factor;
+            }
+        }
+
+        auto diffuse =
+            light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
+        diffuse_sum += Math::MakeVec(diffuse * dist_atten * spot_atten, 0.0f);
+
+        specular_sum += Math::MakeVec(
+            (specular_0 + specular_1) * clamp_highlights * dist_atten * spot_atten, 0.0f);
+    }
+
+    diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
+
+    // Clamp every channel to [0, 1] and convert it to an 8-bit color component
+    auto ToClampedColor = [](const Math::Vec4<float>& sum) {
+        return Math::MakeVec<float>(MathUtil::Clamp(sum.x, 0.0f, 1.0f) * 255,
+                                    MathUtil::Clamp(sum.y, 0.0f, 1.0f) * 255,
+                                    MathUtil::Clamp(sum.z, 0.0f, 1.0f) * 255,
+                                    MathUtil::Clamp(sum.w, 0.0f, 1.0f) * 255)
+            .Cast<u8>();
+    };
+    return std::make_tuple(ToClampedColor(diffuse_sum), ToClampedColor(specular_sum));
+}
+
+} // namespace Pica
diff --git a/src/video_core/swrasterizer/lighting.h b/src/video_core/swrasterizer/lighting.h
new file mode 100644
index 000000000..438dca926
--- /dev/null
+++ b/src/video_core/swrasterizer/lighting.h
@@ -0,0 +1,18 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <tuple>
+#include "common/quaternion.h"
+#include "common/vector_math.h"
+#include "video_core/pica_state.h"
+
+namespace Pica {
+/// Computes the {diffuse, specular} fragment lighting colors (8 bits per channel).
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view);
+
+} // namespace Pica
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 512e81c08..fdc1df199 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -13,6 +13,7 @@
 #include "common/logging/log.h"
 #include "common/math_util.h"
 #include "common/microprofile.h"
+#include "common/quaternion.h"
 #include "common/vector_math.h"
 #include "core/hw/gpu.h"
 #include "core/memory.h"
@@ -24,6 +25,7 @@
 #include "video_core/regs_texturing.h"
 #include "video_core/shader/shader.h"
 #include "video_core/swrasterizer/framebuffer.h"
+#include "video_core/swrasterizer/lighting.h"
 #include "video_core/swrasterizer/proctex.h"
 #include "video_core/swrasterizer/rasterizer.h"
 #include "video_core/swrasterizer/texturing.h"
@@ -419,6 +421,26 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                 regs.texturing.tev_combiner_buffer_color.a,
             };
 
+            Math::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
+            Math::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
+
+            if (!g_state.regs.lighting.disable) {
+                Math::Quaternion<float> normquat = Math::Quaternion<float>{
+                    {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
+                     GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
+                     GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
+                    GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
+                }.Normalized();
+
+                Math::Vec3<float> view{
+                    GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
+                    GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
+                    GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
+                };
+                std::tie(primary_fragment_color, secondary_fragment_color) =
+                    ComputeFragmentsColors(g_state.regs.lighting, g_state.lighting, normquat, view);
+            }
+
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
                  ++tev_stage_index) {
                 const auto& tev_stage = tev_stages[tev_stage_index];
@@ -427,14 +449,13 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                 auto GetSource = [&](Source source) -> Math::Vec4<u8> {
                     switch (source) {
                     case Source::PrimaryColor:
+                        return primary_color;
 
-                    // HACK: Until we implement fragment lighting, use primary_color
                     case Source::PrimaryFragmentColor:
-                        return primary_color;
+                        return primary_fragment_color;
 
-                    // HACK: Until we implement fragment lighting, use zero
                     case Source::SecondaryFragmentColor:
-                        return {0, 0, 0, 0};
+                        return secondary_fragment_color;
 
                     case Source::Texture0:
                         return texture_color[0];
diff --git a/src/video_core/swrasterizer/rasterizer.h b/src/video_core/swrasterizer/rasterizer.h
index 2f0877581..66cd6cfd4 100644
--- a/src/video_core/swrasterizer/rasterizer.h
+++ b/src/video_core/swrasterizer/rasterizer.h
@@ -19,10 +19,9 @@ struct Vertex : Shader::OutputVertex {
 
     // Linear interpolation
     // factor: 0=this, 1=vtx
+    // Note: This function cannot be called after perspective divide
     void Lerp(float24 factor, const Vertex& vtx) {
         pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
-
-        // TODO: Should perform perspective correct interpolation here...
         quat = quat * factor + vtx.quat * (float24::FromFloat32(1) - factor);
         color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
         tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
@@ -30,12 +29,11 @@ struct Vertex : Shader::OutputVertex {
         tc0_w = tc0_w * factor + vtx.tc0_w * (float24::FromFloat32(1) - factor);
         view = view * factor + vtx.view * (float24::FromFloat32(1) - factor);
         tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
-
-        screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
     }
 
     // Linear interpolation
     // factor: 0=v0, 1=v1
+    // Note: This function cannot be called after perspective divide
     static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) {
         Vertex ret = v0;
         ret.Lerp(factor, v1);