diff options
Diffstat (limited to '')
-rw-r--r-- | src/citra_qt/debugger/graphics_tracing.cpp | 2 | ||||
-rw-r--r-- | src/core/hle/kernel/memory.cpp | 1 | ||||
-rw-r--r-- | src/core/hle/kernel/process.h | 7 | ||||
-rw-r--r-- | src/core/hle/kernel/thread.cpp | 84 | ||||
-rw-r--r-- | src/core/hle/kernel/thread.h | 4 | ||||
-rw-r--r-- | src/core/memory.h | 6 | ||||
-rw-r--r-- | src/video_core/command_processor.cpp | 2 | ||||
-rw-r--r-- | src/video_core/pica_state.h | 2 | ||||
-rw-r--r-- | src/video_core/shader/shader.cpp | 2 | ||||
-rw-r--r-- | src/video_core/shader/shader.h | 19 | ||||
-rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 68 | ||||
-rw-r--r-- | src/video_core/vertex_loader.cpp | 2 |
12 files changed, 121 insertions, 78 deletions
diff --git a/src/citra_qt/debugger/graphics_tracing.cpp b/src/citra_qt/debugger/graphics_tracing.cpp index 1402f8e79..9c80f7ec9 100644 --- a/src/citra_qt/debugger/graphics_tracing.cpp +++ b/src/citra_qt/debugger/graphics_tracing.cpp @@ -74,7 +74,7 @@ void GraphicsTracingWidget::StartRecording() { std::array<u32, 4 * 16> default_attributes; for (unsigned i = 0; i < 16; ++i) { for (unsigned comp = 0; comp < 3; ++comp) { - default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.default_attributes[i][comp].ToFloat32()); + default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs_default_attributes[i][comp].ToFloat32()); } } diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp index 862643448..61a741e28 100644 --- a/src/core/hle/kernel/memory.cpp +++ b/src/core/hle/kernel/memory.cpp @@ -109,7 +109,6 @@ struct MemoryArea { static MemoryArea memory_areas[] = { {SHARED_MEMORY_VADDR, SHARED_MEMORY_SIZE, "Shared Memory"}, // Shared memory {VRAM_VADDR, VRAM_SIZE, "VRAM"}, // Video memory (VRAM) - {TLS_AREA_VADDR, TLS_AREA_SIZE, "TLS Area"}, // TLS memory }; } diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h index a06afef2b..d781ef32c 100644 --- a/src/core/hle/kernel/process.h +++ b/src/core/hle/kernel/process.h @@ -142,8 +142,11 @@ public: MemoryRegionInfo* memory_region = nullptr; - /// Bitmask of the used TLS slots - std::bitset<300> used_tls_slots; + /// The Thread Local Storage area is allocated as processes create threads, + /// each TLS area is 0x200 bytes, so one page (0x1000) is split up in 8 parts, and each part + /// holds the TLS for a specific thread. This vector contains which parts are in use for each page as a bitmask. + /// This vector will grow as more pages are allocated for new threads. + std::vector<std::bitset<8>> tls_slots; VAddr GetLinearHeapAreaAddress() const; VAddr GetLinearHeapBase() const; diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp index 6dc95d0f1..68f026918 100644 --- a/src/core/hle/kernel/thread.cpp +++ b/src/core/hle/kernel/thread.cpp @@ -117,9 +117,10 @@ void Thread::Stop() { } wait_objects.clear(); - Kernel::g_current_process->used_tls_slots[tls_index] = false; - g_current_process->misc_memory_used -= Memory::TLS_ENTRY_SIZE; - g_current_process->memory_region->used -= Memory::TLS_ENTRY_SIZE; + // Mark the TLS slot in the thread's page as free. + u32 tls_page = (tls_address - Memory::TLS_AREA_VADDR) / Memory::PAGE_SIZE; + u32 tls_slot = ((tls_address - Memory::TLS_AREA_VADDR) % Memory::PAGE_SIZE) / Memory::TLS_ENTRY_SIZE; + Kernel::g_current_process->tls_slots[tls_page].reset(tls_slot); HLE::Reschedule(__func__); } @@ -366,6 +367,31 @@ static void DebugThreadQueue() { } } +/** + * Finds a free location for the TLS section of a thread. + * @param tls_slots The TLS page array of the thread's owner process. + * Returns a tuple of (page, slot, alloc_needed) where: + * page: The index of the first allocated TLS page that has free slots. + * slot: The index of the first free slot in the indicated page. + * alloc_needed: Whether there's a need to allocate a new TLS page (All pages are full). + */ +std::tuple<u32, u32, bool> GetFreeThreadLocalSlot(std::vector<std::bitset<8>>& tls_slots) { + // Iterate over all the allocated pages, and try to find one where not all slots are used. + for (unsigned page = 0; page < tls_slots.size(); ++page) { + const auto& page_tls_slots = tls_slots[page]; + if (!page_tls_slots.all()) { + // We found a page with at least one free slot, find which slot it is + for (unsigned slot = 0; slot < page_tls_slots.size(); ++slot) { + if (!page_tls_slots.test(slot)) { + return std::make_tuple(page, slot, false); + } + } + } + } + + return std::make_tuple(0, 0, true); +} + ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point, s32 priority, u32 arg, s32 processor_id, VAddr stack_top) { if (priority < THREADPRIO_HIGHEST || priority > THREADPRIO_LOWEST) { @@ -403,22 +429,50 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point, thread->name = std::move(name); thread->callback_handle = wakeup_callback_handle_table.Create(thread).MoveFrom(); thread->owner_process = g_current_process; - thread->tls_index = -1; thread->waitsynch_waited = false; // Find the next available TLS index, and mark it as used - auto& used_tls_slots = Kernel::g_current_process->used_tls_slots; - for (unsigned int i = 0; i < used_tls_slots.size(); ++i) { - if (used_tls_slots[i] == false) { - thread->tls_index = i; - used_tls_slots[i] = true; - break; + auto& tls_slots = Kernel::g_current_process->tls_slots; + bool needs_allocation = true; + u32 available_page; // Which allocated page has free space + u32 available_slot; // Which slot within the page is free + + std::tie(available_page, available_slot, needs_allocation) = GetFreeThreadLocalSlot(tls_slots); + + if (needs_allocation) { + // There are no already-allocated pages with free slots, lets allocate a new one. + // TLS pages are allocated from the BASE region in the linear heap. + MemoryRegionInfo* memory_region = GetMemoryRegion(MemoryRegion::BASE); + auto& linheap_memory = memory_region->linear_heap_memory; + + if (linheap_memory->size() + Memory::PAGE_SIZE > memory_region->size) { + LOG_ERROR(Kernel_SVC, "Not enough space in region to allocate a new TLS page for thread"); + return ResultCode(ErrorDescription::OutOfMemory, ErrorModule::Kernel, ErrorSummary::OutOfResource, ErrorLevel::Permanent); } + + u32 offset = linheap_memory->size(); + + // Allocate some memory from the end of the linear heap for this region. + linheap_memory->insert(linheap_memory->end(), Memory::PAGE_SIZE, 0); + memory_region->used += Memory::PAGE_SIZE; + Kernel::g_current_process->linear_heap_used += Memory::PAGE_SIZE; + + tls_slots.emplace_back(0); // The page is completely available at the start + available_page = tls_slots.size() - 1; + available_slot = 0; // Use the first slot in the new page + + auto& vm_manager = Kernel::g_current_process->vm_manager; + vm_manager.RefreshMemoryBlockMappings(linheap_memory.get()); + + // Map the page to the current process' address space. + // TODO(Subv): Find the correct MemoryState for this region. + vm_manager.MapMemoryBlock(Memory::TLS_AREA_VADDR + available_page * Memory::PAGE_SIZE, + linheap_memory, offset, Memory::PAGE_SIZE, MemoryState::Private); } - ASSERT_MSG(thread->tls_index != -1, "Out of TLS space"); - g_current_process->misc_memory_used += Memory::TLS_ENTRY_SIZE; - g_current_process->memory_region->used += Memory::TLS_ENTRY_SIZE; + // Mark the slot as used + tls_slots[available_page].set(available_slot); + thread->tls_address = Memory::TLS_AREA_VADDR + available_page * Memory::PAGE_SIZE + available_slot * Memory::TLS_ENTRY_SIZE; // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used // to initialize the context @@ -509,10 +563,6 @@ void Thread::SetWaitSynchronizationOutput(s32 output) { context.cpu_registers[1] = output; } -VAddr Thread::GetTLSAddress() const { - return Memory::TLS_AREA_VADDR + tls_index * Memory::TLS_ENTRY_SIZE; -} - //////////////////////////////////////////////////////////////////////////////////////////////////// void ThreadingInit() { diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h index 97ba57fc5..deab5d5a6 100644 --- a/src/core/hle/kernel/thread.h +++ b/src/core/hle/kernel/thread.h @@ -127,7 +127,7 @@ public: * Returns the Thread Local Storage address of the current thread * @returns VAddr of the thread's TLS */ - VAddr GetTLSAddress() const; + VAddr GetTLSAddress() const { return tls_address; } Core::ThreadContext context; @@ -144,7 +144,7 @@ public: s32 processor_id; - s32 tls_index; ///< Index of the Thread Local Storage of the thread + VAddr tls_address; ///< Virtual address of the Thread Local Storage of the thread bool waitsynch_waited; ///< Set to true if the last svcWaitSynch call caused the thread to wait diff --git a/src/core/memory.h b/src/core/memory.h index 9caa3c3f5..126d60471 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -100,15 +100,9 @@ enum : VAddr { SHARED_PAGE_SIZE = 0x00001000, SHARED_PAGE_VADDR_END = SHARED_PAGE_VADDR + SHARED_PAGE_SIZE, - // TODO(yuriks): The size of this area is dynamic, the kernel grows - // it as more and more threads are created. For now we'll just use a - // hardcoded value. /// Area where TLS (Thread-Local Storage) buffers are allocated. TLS_AREA_VADDR = 0x1FF82000, TLS_ENTRY_SIZE = 0x200, - TLS_AREA_SIZE = 300 * TLS_ENTRY_SIZE + 0x800, // Space for up to 300 threads + round to page size - TLS_AREA_VADDR_END = TLS_AREA_VADDR + TLS_AREA_SIZE, - /// Equivalent to LINEAR_HEAP_VADDR, but expanded to cover the extra memory in the New 3DS. NEW_LINEAR_HEAP_VADDR = 0x30000000, diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index e7dc5ddac..ad0da796e 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -128,7 +128,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // TODO: Verify that this actually modifies the register! if (setup.index < 15) { - g_state.vs.default_attributes[setup.index] = attribute; + g_state.vs_default_attributes[setup.index] = attribute; setup.index++; } else { // Put each attribute into an immediate input buffer. diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h index 1059c6ae4..495174c25 100644 --- a/src/video_core/pica_state.h +++ b/src/video_core/pica_state.h @@ -25,6 +25,8 @@ struct State { Shader::ShaderSetup vs; Shader::ShaderSetup gs; + std::array<Math::Vec4<float24>, 16> vs_default_attributes; + struct { union LutEntry { // Used for raw access diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 449fc703f..e93a9d92a 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -67,7 +67,6 @@ OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, MICROPROFILE_SCOPE(GPU_Shader); - state.program_counter = config.main_offset; state.debug.max_offset = 0; state.debug.max_opdesc_id = 0; @@ -143,7 +142,6 @@ OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) { UnitState<true> state; - state.program_counter = config.main_offset; state.debug.max_offset = 0; state.debug.max_opdesc_id = 0; diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 7f417675a..983e4a967 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -272,29 +272,12 @@ struct UnitState { } registers; static_assert(std::is_pod<Registers>::value, "Structure is not POD"); - u32 program_counter; bool conditional_code[2]; // Two Address registers and one loop counter // TODO: How many bits do these actually have? s32 address_registers[3]; - enum { - INVALID_ADDRESS = 0xFFFFFFFF - }; - - struct CallStackElement { - u32 final_address; // Address upon which we jump to return_address - u32 return_address; // Where to jump when leaving scope - u8 repeat_counter; // How often to repeat until this call stack element is removed - u8 loop_increment; // Which value to add to the loop counter after an iteration - // TODO: Should this be a signed value? Does it even matter? - u32 loop_address; // The address where we'll return to after each loop iteration - }; - - // TODO: Is there a maximal size for this? - boost::container::static_vector<CallStackElement, 16> call_stack; - DebugData<Debug> debug; static size_t InputOffset(const SourceRegister& reg) { @@ -340,8 +323,6 @@ struct ShaderSetup { std::array<Math::Vec4<u8>, 4> i; } uniforms; - Math::Vec4<float24> default_attributes[16]; - std::array<u32, 1024> program_code; std::array<u32, 1024> swizzle_data; diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 7710f7fbc..3a827d11f 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -29,8 +29,24 @@ namespace Pica { namespace Shader { +constexpr u32 INVALID_ADDRESS = 0xFFFFFFFF; + +struct CallStackElement { + u32 final_address; // Address upon which we jump to return_address + u32 return_address; // Where to jump when leaving scope + u8 repeat_counter; // How often to repeat until this call stack element is removed + u8 loop_increment; // Which value to add to the loop counter after an iteration + // TODO: Should this be a signed value? Does it even matter? + u32 loop_address; // The address where we'll return to after each loop iteration +}; + template<bool Debug> void RunInterpreter(UnitState<Debug>& state) { + // TODO: Is there a maximal size for this? + boost::container::static_vector<CallStackElement, 16> call_stack; + + u32 program_counter = g_state.regs.vs.main_offset; + const auto& uniforms = g_state.vs.uniforms; const auto& swizzle_data = g_state.vs.swizzle_data; const auto& program_code = g_state.vs.program_code; @@ -41,16 +57,16 @@ void RunInterpreter(UnitState<Debug>& state) { unsigned iteration = 0; bool exit_loop = false; while (!exit_loop) { - if (!state.call_stack.empty()) { - auto& top = state.call_stack.back(); - if (state.program_counter == top.final_address) { + if (!call_stack.empty()) { + auto& top = call_stack.back(); + if (program_counter == top.final_address) { state.address_registers[2] += top.loop_increment; if (top.repeat_counter-- == 0) { - state.program_counter = top.return_address; - state.call_stack.pop_back(); + program_counter = top.return_address; + call_stack.pop_back(); } else { - state.program_counter = top.loop_address; + program_counter = top.loop_address; } // TODO: Is "trying again" accurate to hardware? @@ -58,20 +74,20 @@ void RunInterpreter(UnitState<Debug>& state) { } } - const Instruction instr = { program_code[state.program_counter] }; + const Instruction instr = { program_code[program_counter] }; const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; - static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions, + static auto call = [&program_counter, &call_stack](UnitState<Debug>& state, u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) { - state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset - ASSERT(state.call_stack.size() < state.call_stack.capacity()); - state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset }); + program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset + ASSERT(call_stack.size() < call_stack.capacity()); + call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset }); }; - Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter); + Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, program_counter); if (iteration > 0) - Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter); + Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, program_counter); - state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter); + state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + program_counter); auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { switch (source_reg.GetRegisterType()) { @@ -519,7 +535,7 @@ void RunInterpreter(UnitState<Debug>& state) { case OpCode::Id::JMPC: Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { - state.program_counter = instr.flow_control.dest_offset - 1; + program_counter = instr.flow_control.dest_offset - 1; } break; @@ -527,7 +543,7 @@ void RunInterpreter(UnitState<Debug>& state) { Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id] == !(instr.flow_control.num_instructions & 1)) { - state.program_counter = instr.flow_control.dest_offset - 1; + program_counter = instr.flow_control.dest_offset - 1; } break; @@ -535,7 +551,7 @@ void RunInterpreter(UnitState<Debug>& state) { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - state.program_counter + 1, 0, 0); + program_counter + 1, 0, 0); break; case OpCode::Id::CALLU: @@ -544,7 +560,7 @@ void RunInterpreter(UnitState<Debug>& state) { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - state.program_counter + 1, 0, 0); + program_counter + 1, 0, 0); } break; @@ -554,7 +570,7 @@ void RunInterpreter(UnitState<Debug>& state) { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - state.program_counter + 1, 0, 0); + program_counter + 1, 0, 0); } break; @@ -565,8 +581,8 @@ void RunInterpreter(UnitState<Debug>& state) { Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { call(state, - state.program_counter + 1, - instr.flow_control.dest_offset - state.program_counter - 1, + program_counter + 1, + instr.flow_control.dest_offset - program_counter - 1, instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } else { call(state, @@ -584,8 +600,8 @@ void RunInterpreter(UnitState<Debug>& state) { Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { call(state, - state.program_counter + 1, - instr.flow_control.dest_offset - state.program_counter - 1, + program_counter + 1, + instr.flow_control.dest_offset - program_counter - 1, instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } else { call(state, @@ -607,8 +623,8 @@ void RunInterpreter(UnitState<Debug>& state) { Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param); call(state, - state.program_counter + 1, - instr.flow_control.dest_offset - state.program_counter + 1, + program_counter + 1, + instr.flow_control.dest_offset - program_counter + 1, instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z); @@ -625,7 +641,7 @@ void RunInterpreter(UnitState<Debug>& state) { } } - ++state.program_counter; + ++program_counter; ++iteration; } } diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp index 21ae52949..83896814f 100644 --- a/src/video_core/vertex_loader.cpp +++ b/src/video_core/vertex_loader.cpp @@ -124,7 +124,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::I input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); } else if (vertex_attribute_is_default[i]) { // Load the default attribute if we're configured to do so - input.attr[i] = g_state.vs.default_attributes[i]; + input.attr[i] = g_state.vs_default_attributes[i]; LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", i, vertex, index, input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), |