From b78030b02fc4e0a288423400ec7e91e47e757644 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tur=C3=A1nszki=20J=C3=A1nos?= Date: Fri, 19 Jul 2024 06:56:14 +0200 Subject: [PATCH] async queue updates (#885) --- .github/workflows/build-nightly.yml | 15 +-- WickedEngine/shaders/ShaderInterop.h | 5 +- WickedEngine/wiGraphicsDevice.h | 8 ++ WickedEngine/wiGraphicsDevice_DX12.cpp | 164 ++++++++++++----------- WickedEngine/wiGraphicsDevice_DX12.h | 45 ++++++- WickedEngine/wiGraphicsDevice_Vulkan.cpp | 119 +++++++++++----- WickedEngine/wiGraphicsDevice_Vulkan.h | 34 ++++- WickedEngine/wiOcean.cpp | 17 ++- WickedEngine/wiOcean.h | 2 + WickedEngine/wiRenderPath3D.cpp | 123 ++++++++++------- WickedEngine/wiRenderer.cpp | 57 ++++++-- WickedEngine/wiRenderer.h | 7 +- WickedEngine/wiScene.cpp | 10 +- WickedEngine/wiScene_Components.h | 1 + 14 files changed, 403 insertions(+), 204 deletions(-) diff --git a/.github/workflows/build-nightly.yml b/.github/workflows/build-nightly.yml index 98332a3db..0c206627a 100644 --- a/.github/workflows/build-nightly.yml +++ b/.github/workflows/build-nightly.yml @@ -51,24 +51,17 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/cache@v4 - with: - path: ~/.cache/ccache - key: ccache-${{ github.run_id }} - restore-keys: ccache - save-always: true - - name: Install dependencies run: | sudo apt update - sudo apt install libsdl2-dev ccache + sudo apt install libsdl2-dev - name: Initial compile run: | mkdir build cd build - cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - CCACHE_NODIRECT=1 make -j$(nproc) + cmake .. 
-DCMAKE_BUILD_TYPE=Release + make -j$(nproc) - name: Generate shader dump run: | @@ -79,7 +72,7 @@ jobs: - name: Recompile with shader dump run: | cd build - CCACHE_NODIRECT=1 make -B -j $(nproc) + make -B -j $(nproc) - name: Move files run: | diff --git a/WickedEngine/shaders/ShaderInterop.h b/WickedEngine/shaders/ShaderInterop.h index 6cfaee761..16e8cffbb 100644 --- a/WickedEngine/shaders/ShaderInterop.h +++ b/WickedEngine/shaders/ShaderInterop.h @@ -132,13 +132,12 @@ static const uint IndirectDispatchArgsAlignment = 4u; #define CBSLOT_OTHER_EMITTEDPARTICLE 4 #define CBSLOT_OTHER_HAIRPARTICLE 4 #define CBSLOT_OTHER_FFTGENERATOR 4 -#define CBSLOT_OTHER_OCEAN_SIMULATION_IMMUTABLE 4 -#define CBSLOT_OTHER_OCEAN_SIMULATION_PERFRAME 5 -#define CBSLOT_OTHER_OCEAN_RENDER 7 +#define CBSLOT_OTHER_OCEAN 4 #define CBSLOT_OTHER_CLOUDGENERATOR 4 #define CBSLOT_OTHER_GPUSORTLIB 4 #define CBSLOT_MSAO 4 #define CBSLOT_FSR 4 +#define CBSLOT_TRAILRENDERER 4 #endif // !__PSSL__ && !__SCE__ #endif // WI_SHADERINTEROP_H diff --git a/WickedEngine/wiGraphicsDevice.h b/WickedEngine/wiGraphicsDevice.h index cd1005d00..c02ee38f4 100644 --- a/WickedEngine/wiGraphicsDevice.h +++ b/WickedEngine/wiGraphicsDevice.h @@ -131,12 +131,14 @@ namespace wi::graphics // Returns whether the graphics debug layer is enabled. It can be enabled when creating the device. 
constexpr bool IsDebugDevice() const { return validationMode != ValidationMode::Disabled; } + // Get GPU-specific metrics: constexpr size_t GetShaderIdentifierSize() const { return SHADER_IDENTIFIER_SIZE; } constexpr size_t GetTopLevelAccelerationStructureInstanceSize() const { return TOPLEVEL_ACCELERATION_STRUCTURE_INSTANCE_SIZE; } constexpr uint32_t GetVariableRateShadingTileSize() const { return VARIABLE_RATE_SHADING_TILE_SIZE; } constexpr uint64_t GetTimestampFrequency() const { return TIMESTAMP_FREQUENCY; } constexpr uint64_t GetVideoDecodeBitstreamAlignment() const { return VIDEO_DECODE_BITSTREAM_ALIGNMENT; } + // Get information about the graphics device manufacturer: constexpr uint32_t GetVendorId() const { return vendorId; } constexpr uint32_t GetDeviceId() const { return deviceId; } constexpr const std::string& GetAdapterName() const { return adapterName; } @@ -178,7 +180,13 @@ namespace wi::graphics // - These commands are not immediately executed, but they begin executing on the GPU after calling SubmitCommandLists() // - These are not thread safe, only a single thread should use a single CommandList at one time + // Tell the command list to wait for another command list which was started before it + // The granularity of this is at least that the beginning of the command list will wait for the end of the other command list + // On some platforms like PS5 this can be implemented by waiting exactly at the wait insertion point within the command lists which is more precise virtual void WaitCommandList(CommandList cmd, CommandList wait_for) = 0; + // Tell the command list to wait for the specified queue to finish processing + // It is useful when you want to wait for a previous frame, or just don't know which command list to wait for + virtual void WaitQueue(CommandList cmd, QUEUE_TYPE wait_for) = 0; virtual void RenderPassBegin(const SwapChain* swapchain, CommandList cmd) = 0; virtual void RenderPassBegin(const RenderPassImage* images, uint32_t 
image_count, CommandList cmd, RenderPassFlags flags = RenderPassFlags::NONE) = 0; virtual void RenderPassEnd(CommandList cmd) = 0; diff --git a/WickedEngine/wiGraphicsDevice_DX12.cpp b/WickedEngine/wiGraphicsDevice_DX12.cpp index 21d43c397..afbdef221 100644 --- a/WickedEngine/wiGraphicsDevice_DX12.cpp +++ b/WickedEngine/wiGraphicsDevice_DX12.cpp @@ -1610,7 +1610,38 @@ namespace dx12_internal } using namespace dx12_internal; - +#ifdef PLATFORM_XBOX +std::mutex queue_locker; +#endif // PLATFORM_XBOX + + void GraphicsDevice_DX12::CommandQueue::signal(const Semaphore& semaphore) + { + if (queue == nullptr) + return; + HRESULT hr = queue->Signal(semaphore.fence.Get(), semaphore.fenceValue); + assert(SUCCEEDED(hr)); + } + void GraphicsDevice_DX12::CommandQueue::wait(const Semaphore& semaphore) + { + if (queue == nullptr) + return; + HRESULT hr = queue->Wait(semaphore.fence.Get(), semaphore.fenceValue); + assert(SUCCEEDED(hr)); + } + void GraphicsDevice_DX12::CommandQueue::submit() + { + if (queue == nullptr) + return; + if (submit_cmds.empty()) + return; + + queue->ExecuteCommandLists( + (UINT)submit_cmds.size(), + submit_cmds.data() + ); + + submit_cmds.clear(); + } void GraphicsDevice_DX12::CopyAllocator::init(GraphicsDevice_DX12* device) { @@ -2461,15 +2492,6 @@ using namespace dx12_internal; } hr = queues[QUEUE_GRAPHICS].queue->SetName(L"QUEUE_GRAPHICS"); assert(SUCCEEDED(hr)); - hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(queues[QUEUE_GRAPHICS].fence)); - assert(SUCCEEDED(hr)); - if (FAILED(hr)) - { - std::stringstream ss(""); - ss << "ID3D12Device::CreateFence[QUEUE_GRAPHICS] failed! 
ERROR: 0x" << std::hex << hr; - wi::helper::messageBox(ss.str(), "Error!"); - wi::platform::Exit(); - } } { @@ -2488,15 +2510,6 @@ using namespace dx12_internal; } hr = queues[QUEUE_COMPUTE].queue->SetName(L"QUEUE_COMPUTE"); assert(SUCCEEDED(hr)); - hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(queues[QUEUE_COMPUTE].fence)); - assert(SUCCEEDED(hr)); - if (FAILED(hr)) - { - std::stringstream ss(""); - ss << "ID3D12Device::CreateFence[QUEUE_COMPUTE] failed! ERROR: 0x" << std::hex << hr; - wi::helper::messageBox(ss.str(), "Error!"); - wi::platform::Exit(); - } } { @@ -2515,15 +2528,6 @@ using namespace dx12_internal; } hr = queues[QUEUE_COPY].queue->SetName(L"QUEUE_COPY"); assert(SUCCEEDED(hr)); - hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(queues[QUEUE_COPY].fence)); - assert(SUCCEEDED(hr)); - if (FAILED(hr)) - { - std::stringstream ss(""); - ss << "ID3D12Device::CreateFence[QUEUE_COPY] failed! ERROR: 0x" << std::hex << hr; - wi::helper::messageBox(ss.str(), "Error!"); - wi::platform::Exit(); - } } if (SUCCEEDED(device.As(&video_device))) @@ -2539,15 +2543,6 @@ using namespace dx12_internal; capabilities |= GraphicsDeviceCapability::VIDEO_DECODE_H264; hr = queues[QUEUE_VIDEO_DECODE].queue->SetName(L"QUEUE_VIDEO_DECODE"); assert(SUCCEEDED(hr)); - hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(queues[QUEUE_VIDEO_DECODE].fence)); - assert(SUCCEEDED(hr)); - if (FAILED(hr)) - { - std::stringstream ss(""); - ss << "ID3D12Device::CreateFence[QUEUE_VIDEO_DECODE] failed! 
ERROR: 0x" << std::hex << hr; - wi::helper::messageBox(ss.str(), "Error!"); - wi::platform::Exit(); - } } } @@ -3552,10 +3547,11 @@ using namespace dx12_internal; { resourcedesc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; } - if (has_flag(desc->misc_flags, ResourceMiscFlag::VIDEO_DECODE)) + if (!has_flag(desc->bind_flags, BindFlag::DEPTH_STENCIL) && resourcedesc.SampleDesc.Count <= 1) { - // Because video queue can only transition from/to VIDEO_ and COMMON states, we will use COMMON internally and rely on implicit transition for DPB textures - // (See how the resource barrier on video queue overrides any user specified state into COMMON) + // The copy and video queues have much stricter requirements to supported resource states, but they support + // implicit promotion from COMMON state. Because user is not allowed to set resource to COMMON state, we use this flag + // so textures automatically decay to COMMON state at the queue submit when they are left in a read-only state resourcedesc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS; } @@ -5261,7 +5257,6 @@ using namespace dx12_internal; commandlist.reset(GetBufferIndex()); commandlist.queue = queue; commandlist.id = cmd_current; - commandlist.waited_on.store(false); if (commandlist.GetCommandList() == nullptr) { @@ -5382,38 +5377,56 @@ using namespace dx12_internal; assert(SUCCEEDED(hr)); CommandQueue& queue = queues[commandlist.queue]; + const bool dependency = !commandlist.signals.empty() || !commandlist.waits.empty() || !commandlist.wait_queues.empty(); + + if (dependency) + { + // If the current commandlist must resolve a dependency, then previous ones will be submitted before doing that: + // This improves GPU utilization because not the whole batch of command lists will need to synchronize, but only the one that handles it + queue.submit(); + } + queue.submit_cmds.push_back(commandlist.GetCommandList()); - if (commandlist.waited_on.load() || !commandlist.waits.empty()) + if (dependency) { - 
for (auto& wait : commandlist.waits) + for (auto& wait : commandlist.wait_queues) { - // record wait for signal on a previous submit: - const CommandList_DX12& waitcommandlist = GetCommandList(wait); - hr = queue.queue->Wait( - queues[waitcommandlist.queue].fence.Get(), - FRAMECOUNT * commandlists.size() + (uint64_t)waitcommandlist.id - ); - assert(SUCCEEDED(hr)); - } + CommandQueue& waitqueue = queues[wait.first]; + const Semaphore& semaphore = wait.second; - if (!queue.submit_cmds.empty()) - { - queue.queue->ExecuteCommandLists( - (UINT)queue.submit_cmds.size(), - queue.submit_cmds.data() - ); - queue.submit_cmds.clear(); - } + // The WaitQueue operation will submit and signal the specified dependency queue: + waitqueue.submit(); + waitqueue.signal(semaphore); // signals immediately after submit - if (commandlist.waited_on.load()) - { - hr = queue.queue->Signal( - queue.fence.Get(), - FRAMECOUNT * commandlists.size() + (uint64_t)commandlist.id - ); - assert(SUCCEEDED(hr)); + // The current queue will be waiting for the dependency queue to complete: + queue.wait(semaphore); + + // recycle semaphore: + free_semaphore(semaphore); } + commandlist.wait_queues.clear(); + + for(auto& semaphore : commandlist.waits) + { + // Wait for command list dependency: + queue.wait(semaphore); + + // semaphore is not recycled here, only the signals recycle themselves because the wait uses the same semaphore + } + commandlist.waits.clear(); + + queue.submit(); + + for(auto& semaphore : commandlist.signals) + { + // Signal this command list's completion: + queue.signal(semaphore); + + // recycle semaphore: + free_semaphore(semaphore); + } + commandlist.signals.clear(); } for (auto& x : commandlist.pipelines_worker) @@ -5439,14 +5452,7 @@ using namespace dx12_internal; if (queue.queue == nullptr) continue; - if (!queue.submit_cmds.empty()) - { - queue.queue->ExecuteCommandLists( - (UINT)queue.submit_cmds.size(), - queue.submit_cmds.data() - ); - queue.submit_cmds.clear(); - } + queue.submit(); 
hr = queue.queue->Signal(frame_fence[GetBufferIndex()][q].Get(), 1); assert(SUCCEEDED(hr)); @@ -5947,8 +5953,14 @@ using namespace dx12_internal; CommandList_DX12& commandlist = GetCommandList(cmd); CommandList_DX12& commandlist_wait_for = GetCommandList(wait_for); assert(commandlist_wait_for.id < commandlist.id); // can't wait for future command list! - commandlist.waits.push_back(wait_for); - commandlist_wait_for.waited_on.store(true); + Semaphore semaphore = new_semaphore(); + commandlist.waits.push_back(semaphore); + commandlist_wait_for.signals.push_back(semaphore); + } + void GraphicsDevice_DX12::WaitQueue(CommandList cmd, QUEUE_TYPE wait_for) + { + CommandList_DX12& commandlist = GetCommandList(cmd); + commandlist.wait_queues.push_back(std::make_pair(wait_for, new_semaphore())); } void GraphicsDevice_DX12::RenderPassBegin(const SwapChain* swapchain, CommandList cmd) { diff --git a/WickedEngine/wiGraphicsDevice_DX12.h b/WickedEngine/wiGraphicsDevice_DX12.h index e14939336..5d76f9eb1 100644 --- a/WickedEngine/wiGraphicsDevice_DX12.h +++ b/WickedEngine/wiGraphicsDevice_DX12.h @@ -71,17 +71,22 @@ namespace wi::graphics D3D12_CPU_DESCRIPTOR_HANDLE nullUAV = {}; D3D12_CPU_DESCRIPTOR_HANDLE nullSAM = {}; + struct Semaphore + { + Microsoft::WRL::ComPtr fence; + uint64_t fenceValue = 0; + }; + struct CommandQueue { D3D12_COMMAND_QUEUE_DESC desc = {}; Microsoft::WRL::ComPtr queue; - Microsoft::WRL::ComPtr fence; wi::vector submit_cmds; - } queues[QUEUE_COUNT]; -#ifdef PLATFORM_XBOX - std::mutex queue_locker; -#endif // PLATFORM_XBOX + void signal(const Semaphore& semaphore); + void wait(const Semaphore& semaphore); + void submit(); + } queues[QUEUE_COUNT]; struct CopyAllocator { @@ -124,6 +129,28 @@ namespace wi::graphics void flush(bool graphics, CommandList cmd); }; + wi::vector semaphore_pool; + std::mutex semaphore_pool_locker; + Semaphore new_semaphore() + { + std::scoped_lock lck(semaphore_pool_locker); + if (semaphore_pool.empty()) + { + Semaphore& dependency = 
semaphore_pool.emplace_back(); + HRESULT hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(dependency.fence)); + assert(SUCCEEDED(hr)); + } + Semaphore semaphore = std::move(semaphore_pool.back()); + semaphore_pool.pop_back(); + semaphore.fenceValue++; + return semaphore; + } + void free_semaphore(const Semaphore& semaphore) + { + std::scoped_lock lck(semaphore_pool_locker); + semaphore_pool.push_back(semaphore); + } + struct CommandList_DX12 { Microsoft::WRL::ComPtr commandAllocators[BUFFERCOUNT][QUEUE_COUNT]; @@ -133,8 +160,9 @@ namespace wi::graphics QUEUE_TYPE queue = {}; uint32_t id = 0; - wi::vector waits; - std::atomic_bool waited_on{ false }; + wi::vector> wait_queues; + wi::vector waits; + wi::vector signals; DescriptorBinder binder; GPULinearAllocator frame_allocators[BUFFERCOUNT]; @@ -176,7 +204,9 @@ namespace wi::graphics void reset(uint32_t bufferindex) { buffer_index = bufferindex; + wait_queues.clear(); waits.clear(); + signals.clear(); binder.reset(); frame_allocators[buffer_index].reset(); prev_pt = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED; @@ -336,6 +366,7 @@ namespace wi::graphics ///////////////Thread-sensitive//////////////////////// void WaitCommandList(CommandList cmd, CommandList wait_for) override; + void WaitQueue(CommandList cmd, QUEUE_TYPE wait_for) override; void RenderPassBegin(const SwapChain* swapchain, CommandList cmd) override; void RenderPassBegin(const RenderPassImage* images, uint32_t image_count, CommandList cmd, RenderPassFlags flags = RenderPassFlags::NONE) override; void RenderPassEnd(CommandList cmd) override; diff --git a/WickedEngine/wiGraphicsDevice_Vulkan.cpp b/WickedEngine/wiGraphicsDevice_Vulkan.cpp index 8d3d01672..f08b90580 100644 --- a/WickedEngine/wiGraphicsDevice_Vulkan.cpp +++ b/WickedEngine/wiGraphicsDevice_Vulkan.cpp @@ -348,16 +348,20 @@ namespace vulkan_internal case ResourceState::UNORDERED_ACCESS: return VK_IMAGE_LAYOUT_GENERAL; case ResourceState::COPY_SRC: - return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; 
case ResourceState::COPY_DST: - return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + // we can't assume transfer layout because it's allowed for resource to be used by multiple queues like DX12 (decay to common state), so this is a workaround + // the problem is that image copy commands will require specifying the current layout, but different queues can often use textures in different layouts + return VK_IMAGE_LAYOUT_GENERAL; case ResourceState::SHADING_RATE_SOURCE: return VK_IMAGE_LAYOUT_FRAGMENT_SHADING_RATE_ATTACHMENT_OPTIMAL_KHR; case ResourceState::VIDEO_DECODE_SRC: case ResourceState::VIDEO_DECODE_DST: return VK_IMAGE_LAYOUT_VIDEO_DECODE_DPB_KHR; default: - return VK_IMAGE_LAYOUT_UNDEFINED; + // combination of state flags will default to general + // whether the combination of states is valid needs to be validated by the user + // combining read-only states should be fine + return VK_IMAGE_LAYOUT_GENERAL; } } constexpr VkShaderStageFlags _ConvertStageFlags(ShaderStage value) @@ -741,6 +745,7 @@ namespace vulkan_internal std::shared_ptr allocationhandler; VmaAllocation allocation = nullptr; VkImage resource = VK_NULL_HANDLE; + VkImageLayout defaultLayout = VK_IMAGE_LAYOUT_GENERAL; VkBuffer staging_resource = VK_NULL_HANDLE; struct TextureSubresource { @@ -1321,6 +1326,26 @@ using namespace vulkan_internal; + void GraphicsDevice_Vulkan::CommandQueue::signal(VkSemaphore semaphore) + { + if (queue == VK_NULL_HANDLE) + return; + VkSemaphoreSubmitInfo& signalSemaphore = submit_signalSemaphoreInfos.emplace_back(); + signalSemaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO; + signalSemaphore.semaphore = semaphore; + signalSemaphore.value = 0; // not a timeline semaphore + signalSemaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; + } + void GraphicsDevice_Vulkan::CommandQueue::wait(VkSemaphore semaphore) + { + if (queue == VK_NULL_HANDLE) + return; + VkSemaphoreSubmitInfo& waitSemaphore = submit_waitSemaphoreInfos.emplace_back(); + waitSemaphore.sType = 
VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO; + waitSemaphore.semaphore = semaphore; + waitSemaphore.value = 0; // not a timeline semaphore + waitSemaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; + } void GraphicsDevice_Vulkan::CommandQueue::submit(GraphicsDevice_Vulkan* device, VkFence fence) { if (queue == VK_NULL_HANDLE) @@ -1848,8 +1873,7 @@ using namespace vulkan_internal; auto texture_internal = to_internal((const Texture*)&resource); auto& subresource_descriptor = subresource >= 0 ? texture_internal->subresources_srv[subresource] : texture_internal->srv; imageInfos.back().imageView = subresource_descriptor.image_view; - - imageInfos.back().imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + imageInfos.back().imageLayout = texture_internal->defaultLayout; } } break; @@ -3607,13 +3631,17 @@ using namespace vulkan_internal; { x.destroy(); } - vkDestroySemaphore(device, commandlist->semaphore, nullptr); } for (auto& x : pipelines_global) { vkDestroyPipeline(device, x.second, nullptr); } + for (auto& x : semaphore_pool) + { + vkDestroySemaphore(device, x, nullptr); + } + vmaDestroyBuffer(allocationhandler->allocator, nullBuffer, nullBufferAllocation); vkDestroyBufferView(device, nullBufferView, nullptr); vmaDestroyImage(allocationhandler->allocator, nullImage1D, nullImageAllocation1D); @@ -4056,6 +4084,7 @@ using namespace vulkan_internal; { auto internal_state = std::make_shared(); internal_state->allocationhandler = allocationhandler; + internal_state->defaultLayout = _ConvertImageLayout(desc->layout); texture->internal_state = internal_state; texture->type = GPUResource::Type::TEXTURE; texture->mapped_data = nullptr; @@ -7045,7 +7074,6 @@ using namespace vulkan_internal; commandlist.reset(GetBufferIndex()); commandlist.queue = queue; commandlist.id = cmd_current; - commandlist.waited_on.store(false); if (commandlist.GetCommandBuffer() == VK_NULL_HANDLE) { @@ -7090,11 +7118,6 @@ using namespace vulkan_internal; 
commandlist.binder_pools[buffer].init(this); } - VkSemaphoreCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - res = vkCreateSemaphore(device, &createInfo, nullptr, &commandlist.semaphore); - assert(res == VK_SUCCESS); - commandlist.binder.init(this); } @@ -7157,6 +7180,14 @@ using namespace vulkan_internal; assert(res == VK_SUCCESS); CommandQueue& queue = queues[commandlist.queue]; + const bool dependency = !commandlist.signals.empty() || !commandlist.waits.empty() || !commandlist.wait_queues.empty(); + + if (dependency) + { + // If the current commandlist must resolve a dependency, then previous ones will be submitted before doing that: + // This improves GPU utilization because not the whole batch of command lists will need to synchronize, but only the one that handles it + queue.submit(this, VK_NULL_HANDLE); + } VkCommandBufferSubmitInfo& cbSubmitInfo = queue.submit_cmds.emplace_back(); cbSubmitInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO; @@ -7183,29 +7214,43 @@ using namespace vulkan_internal; signalSemaphore.value = 0; // not a timeline semaphore } - if (commandlist.waited_on.load() || !commandlist.waits.empty()) + if (dependency) { - for (auto& wait : commandlist.waits) + for (auto& wait : commandlist.wait_queues) + { + CommandQueue& waitqueue = queues[wait.first]; + VkSemaphore semaphore = wait.second; + + // The WaitQueue operation will submit and signal the specified dependency queue: + waitqueue.signal(semaphore); // signal recorded, will be executed at submit + waitqueue.submit(this, VK_NULL_HANDLE); + + // The current queue will be waiting for the dependency queue to complete: + queue.wait(semaphore); + + // recycle semaphore + free_semaphore(semaphore); + } + commandlist.wait_queues.clear(); + + for (auto& semaphore : commandlist.waits) { // Wait for command list dependency: - CommandList_Vulkan& waitcommandlist = GetCommandList(wait); + queue.wait(semaphore); - VkSemaphoreSubmitInfo& 
waitSemaphore = queue.submit_waitSemaphoreInfos.emplace_back(); - waitSemaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO; - waitSemaphore.semaphore = waitcommandlist.semaphore; - waitSemaphore.value = 0; // not a timeline semaphore - waitSemaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; + // semaphore is not recycled here, only the signals recycle themselves because the wait uses the same semaphore } + commandlist.waits.clear(); - if (commandlist.waited_on.load()) + for (auto& semaphore : commandlist.signals) { // Signal this command list's completion: - VkSemaphoreSubmitInfo& signalSemaphore = queue.submit_signalSemaphoreInfos.emplace_back(); - signalSemaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO; - signalSemaphore.semaphore = commandlist.semaphore; - signalSemaphore.value = 0; // not a timeline semaphore - signalSemaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; + queue.signal(semaphore); + + // recycle semaphore + free_semaphore(semaphore); } + commandlist.signals.clear(); queue.submit(this, VK_NULL_HANDLE); } @@ -7556,8 +7601,14 @@ using namespace vulkan_internal; CommandList_Vulkan& commandlist = GetCommandList(cmd); CommandList_Vulkan& commandlist_wait_for = GetCommandList(wait_for); assert(commandlist_wait_for.id < commandlist.id); // can't wait for future command list! 
- commandlist.waits.push_back(wait_for); - commandlist_wait_for.waited_on.store(true); + VkSemaphore semaphore = new_semaphore(); + commandlist.waits.push_back(semaphore); + commandlist_wait_for.signals.push_back(semaphore); + } + void GraphicsDevice_Vulkan::WaitQueue(CommandList cmd, QUEUE_TYPE wait_for) + { + CommandList_Vulkan& commandlist = GetCommandList(cmd); + commandlist.wait_queues.push_back(std::make_pair(wait_for, new_semaphore())); } void GraphicsDevice_Vulkan::RenderPassBegin(const SwapChain* swapchain, CommandList cmd) { @@ -8435,7 +8486,7 @@ using namespace vulkan_internal; commandlist.GetCommandBuffer(), internal_state_src->staging_resource, internal_state_dst->resource, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + _ConvertImageLayout(ResourceState::COPY_DST), 1, &copy ); @@ -8473,7 +8524,7 @@ using namespace vulkan_internal; vkCmdCopyImageToBuffer( commandlist.GetCommandBuffer(), internal_state_src->resource, - VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + _ConvertImageLayout(ResourceState::COPY_SRC), internal_state_dst->staging_resource, 1, &copy @@ -8536,8 +8587,8 @@ using namespace vulkan_internal; copy.dstSubresource.mipLevel = 0; vkCmdCopyImage(commandlist.GetCommandBuffer(), - internal_state_src->resource, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, - internal_state_dst->resource, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + internal_state_src->resource, _ConvertImageLayout(ResourceState::COPY_SRC), + internal_state_dst->resource, _ConvertImageLayout(ResourceState::COPY_DST), 1, &copy ); } @@ -8633,9 +8684,9 @@ using namespace vulkan_internal; vkCmdCopyImage( commandlist.GetCommandBuffer(), src_internal->resource, - VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + _ConvertImageLayout(ResourceState::COPY_SRC), dst_internal->resource, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + _ConvertImageLayout(ResourceState::COPY_DST), 1, &copy ); diff --git a/WickedEngine/wiGraphicsDevice_Vulkan.h b/WickedEngine/wiGraphicsDevice_Vulkan.h index 0d19a05ad..09283f2b4 100644 --- 
a/WickedEngine/wiGraphicsDevice_Vulkan.h +++ b/WickedEngine/wiGraphicsDevice_Vulkan.h @@ -120,6 +120,8 @@ namespace wi::graphics bool sparse_binding_supported = false; std::shared_ptr locker; + void signal(VkSemaphore semaphore); + void wait(VkSemaphore semaphore); void submit(GraphicsDevice_Vulkan* device, VkFence fence); } queues[QUEUE_COUNT]; @@ -193,17 +195,40 @@ namespace wi::graphics void reset(); }; + wi::vector semaphore_pool; + std::mutex semaphore_pool_locker; + VkSemaphore new_semaphore() + { + std::scoped_lock lck(semaphore_pool_locker); + if (semaphore_pool.empty()) + { + VkSemaphore& sema = semaphore_pool.emplace_back(); + VkSemaphoreCreateInfo info = {}; + info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + VkResult res = vkCreateSemaphore(device, &info, nullptr, &sema); + assert(res == VK_SUCCESS); + } + VkSemaphore semaphore = semaphore_pool.back(); + semaphore_pool.pop_back(); + return semaphore; + } + void free_semaphore(VkSemaphore semaphore) + { + std::scoped_lock lck(semaphore_pool_locker); + semaphore_pool.push_back(semaphore); + } + struct CommandList_Vulkan { - VkSemaphore semaphore = VK_NULL_HANDLE; VkCommandPool commandPools[BUFFERCOUNT][QUEUE_COUNT] = {}; VkCommandBuffer commandBuffers[BUFFERCOUNT][QUEUE_COUNT] = {}; uint32_t buffer_index = 0; QUEUE_TYPE queue = {}; uint32_t id = 0; - wi::vector waits; - std::atomic_bool waited_on{ false }; + wi::vector> wait_queues; + wi::vector waits; + wi::vector signals; DescriptorBinder binder; DescriptorBinderPool binder_pools[BUFFERCOUNT]; @@ -229,7 +254,9 @@ namespace wi::graphics void reset(uint32_t bufferindex) { buffer_index = bufferindex; + wait_queues.clear(); waits.clear(); + signals.clear(); binder_pools[buffer_index].reset(); binder.reset(); frame_allocators[buffer_index].reset(); @@ -370,6 +397,7 @@ namespace wi::graphics ///////////////Thread-sensitive//////////////////////// void WaitCommandList(CommandList cmd, CommandList wait_for) override; + void WaitQueue(CommandList cmd, 
QUEUE_TYPE wait_for) override; void RenderPassBegin(const SwapChain* swapchain, CommandList cmd) override; void RenderPassBegin(const RenderPassImage* images, uint32_t image_count, CommandList cmd, RenderPassFlags flags = RenderPassFlags::NONE) override; void RenderPassEnd(CommandList cmd) override; diff --git a/WickedEngine/wiOcean.cpp b/WickedEngine/wiOcean.cpp index 5d22ebcff..06dce6d55 100644 --- a/WickedEngine/wiOcean.cpp +++ b/WickedEngine/wiOcean.cpp @@ -187,6 +187,7 @@ namespace wi SubresourceData initdata; initdata.data_ptr = displacementdata.data(); initdata.row_pitch = tex_desc.width * sizeof(XMFLOAT4); + tex_desc.layout = ResourceState::COPY_SRC | ResourceState::SHADER_RESOURCE_COMPUTE; device->CreateTexture(&tex_desc, &initdata, &displacementMap); device->SetName(&displacementMap, "displacementMap"); @@ -429,16 +430,18 @@ namespace wi wi::renderer::GenerateMipChain(gradientMap, wi::renderer::MIPGENFILTER_LINEAR, cmd); - // Copy displacement map to readback: - device->Barrier(GPUBarrier::Image(&displacementMap, displacementMap.desc.layout, ResourceState::COPY_SRC), cmd); - device->CopyResource(&displacementMap_readback[displacement_readback_index], &displacementMap, cmd); - displacement_readback_valid[displacement_readback_index] = true; - displacement_readback_index = (displacement_readback_index + 1) % device->GetBufferCount(); - device->Barrier(GPUBarrier::Image(&displacementMap, ResourceState::COPY_SRC, displacementMap.desc.layout), cmd); - device->EventEnd(cmd); } + void Ocean::CopyDisplacementMapReadback(wi::graphics::CommandList cmd) const + { + GraphicsDevice* device = wi::graphics::GetDevice(); + device->EventBegin("Ocean Readback Copy", cmd); + device->CopyResource(&displacementMap_readback[displacement_readback_index], &displacementMap, cmd); + displacement_readback_valid[displacement_readback_index] = true; + displacement_readback_index = (displacement_readback_index + 1) % device->GetBufferCount(); + device->EventEnd(cmd); + } void 
Ocean::Render(const CameraComponent& camera, CommandList cmd) const { diff --git a/WickedEngine/wiOcean.h b/WickedEngine/wiOcean.h index 47b00746b..a4f7157a5 100644 --- a/WickedEngine/wiOcean.h +++ b/WickedEngine/wiOcean.h @@ -43,6 +43,8 @@ namespace wi void UpdateDisplacementMap(wi::graphics::CommandList cmd) const; void Render(const wi::scene::CameraComponent& camera, wi::graphics::CommandList cmd) const; + void CopyDisplacementMapReadback(wi::graphics::CommandList cmd) const; + const wi::graphics::Texture* getDisplacementMap() const; const wi::graphics::Texture* getGradientMap() const; diff --git a/WickedEngine/wiRenderPath3D.cpp b/WickedEngine/wiRenderPath3D.cpp index a63c210b2..91492dee7 100644 --- a/WickedEngine/wiRenderPath3D.cpp +++ b/WickedEngine/wiRenderPath3D.cpp @@ -800,6 +800,7 @@ namespace wi if (scene->terrains.GetCount() > 0) { cmd_copypages = device->BeginCommandList(QUEUE_COPY); + device->WaitQueue(cmd_copypages, QUEUE_GRAPHICS); // sync to prev frame graphics wi::jobsystem::Execute(ctx, [this, cmd_copypages](wi::jobsystem::JobArgs args) { for (size_t i = 0; i < scene->terrains.GetCount(); ++i) { @@ -810,6 +811,7 @@ namespace wi // Preparing the frame: CommandList cmd = device->BeginCommandList(); + device->WaitQueue(cmd, QUEUE_COMPUTE); // sync to prev frame compute (disallow prev frame overlapping a compute task into updating global scene resources for this frame) CommandList cmd_prepareframe = cmd; wi::renderer::ProcessDeferredTextureRequests(cmd); // Execute it first thing in the frame here, on main thread, to not allow other thread steal it and execute on different command list! 
wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) { @@ -837,7 +839,7 @@ namespace wi }); - // async compute parallel with depth prepass + // async compute parallel with depth prepass cmd = device->BeginCommandList(QUEUE_COMPUTE); CommandList cmd_prepareframe_async = cmd; device->WaitCommandList(cmd, cmd_prepareframe); @@ -1111,13 +1113,26 @@ namespace wi }); + CommandList cmd_ocean; + if (scene->weather.IsOceanEnabled() && scene->ocean.IsValid()) + { + // Ocean simulation can be updated async to opaque passes: + cmd_ocean = device->BeginCommandList(QUEUE_COMPUTE); + wi::renderer::UpdateOcean(visibility_main, cmd_ocean); + + // Copying to readback is done on copy queue to use DMA instead of compute warps: + CommandList cmd_oceancopy = device->BeginCommandList(QUEUE_COPY); + device->WaitCommandList(cmd_oceancopy, cmd_ocean); + wi::renderer::ReadbackOcean(visibility_main, cmd_oceancopy); + } + // Shadow maps: if (getShadowsEnabled()) { cmd = device->BeginCommandList(); wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) { wi::renderer::DrawShadowmaps(visibility_main, cmd); - }); + }); } if (wi::renderer::GetVXGIEnabled() && getSceneUpdateEnabled()) @@ -1331,12 +1346,49 @@ namespace wi }); } - if (scene->weather.IsOceanEnabled()) + // Main camera weather compute effects depending on shadow maps, envmaps, etc, but don't depend on async surface pass: + if (scene->weather.IsRealisticSky() || scene->weather.IsVolumetricClouds()) { - // Ocean simulation can be updated async to opaque passes: - CommandList cmd_ocean = device->BeginCommandList(QUEUE_COMPUTE); - device->WaitCommandList(cmd_ocean, cmd); - wi::renderer::UpdateOcean(visibility_main, cmd_ocean); + cmd = device->BeginCommandList(); + wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) { + + wi::renderer::BindCameraCB( + *camera, + camera_previous, + camera_reflection, + cmd + ); + + if (scene->weather.IsRealisticSky()) + { + 
wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd); + + if (scene->weather.IsRealisticSkyAerialPerspective()) + { + wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd); + } + } + if (scene->weather.IsRealisticSky() && scene->weather.IsRealisticSkyAerialPerspective()) + { + wi::renderer::Postprocess_AerialPerspective( + aerialperspectiveResources, + cmd + ); + } + if (scene->weather.IsVolumetricClouds()) + { + wi::renderer::Postprocess_VolumetricClouds( + volumetriccloudResources, + cmd, + *camera, + camera_previous, + camera_reflection, + wi::renderer::GetTemporalAAEnabled() || getFSR2Enabled(), + scene->weather.volumetricCloudsWeatherMapFirst.IsValid() ? &scene->weather.volumetricCloudsWeatherMapFirst.GetTexture() : nullptr, + scene->weather.volumetricCloudsWeatherMapSecond.IsValid() ? &scene->weather.volumetricCloudsWeatherMapSecond.GetTexture() : nullptr + ); + } + }); } // Main camera opaque color pass: @@ -1354,17 +1406,6 @@ namespace wi cmd ); - // This can't run in "main camera compute effects" async compute, - // because it depends on shadow maps, and envmaps - if (scene->weather.IsRealisticSky()) - { - wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd); - - if (scene->weather.IsRealisticSkyAerialPerspective()) - { - wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd); - } - } if (getRaytracedReflectionEnabled()) { wi::renderer::Postprocess_RTReflection( @@ -1395,26 +1436,6 @@ namespace wi cmd ); } - if (scene->weather.IsRealisticSky() && scene->weather.IsRealisticSkyAerialPerspective()) - { - wi::renderer::Postprocess_AerialPerspective( - aerialperspectiveResources, - cmd - ); - } - if (scene->weather.IsVolumetricClouds()) - { - wi::renderer::Postprocess_VolumetricClouds( - volumetriccloudResources, - cmd, - *camera, - camera_previous, - camera_reflection, - wi::renderer::GetTemporalAAEnabled() || getFSR2Enabled(), - scene->weather.volumetricCloudsWeatherMapFirst.IsValid() ? 
&scene->weather.volumetricCloudsWeatherMapFirst.GetTexture() : nullptr, - scene->weather.volumetricCloudsWeatherMapSecond.IsValid() ? &scene->weather.volumetricCloudsWeatherMapSecond.GetTexture() : nullptr - ); - } // Depth buffers were created on COMPUTE queue, so make them available for pixel shaders here: { @@ -1593,6 +1614,10 @@ namespace wi // Transparents, post processes, etc: cmd = device->BeginCommandList(); + if (cmd_ocean.IsValid()) + { + device->WaitCommandList(cmd, cmd_ocean); + } wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) { GraphicsDevice* device = wi::graphics::GetDevice(); @@ -1613,8 +1638,6 @@ namespace wi RenderTransparents(cmd); - RenderPostprocessChain(cmd); - // Depth buffers expect a non-pixel shader resource state as they are generated on compute queue: { GPUBarrier barriers[] = { @@ -1624,20 +1647,24 @@ namespace wi }; device->Barrier(barriers, arraysize(barriers), cmd); } - - wi::renderer::TextureStreamingReadbackCopy(*scene, cmd); }); if (scene->IsWetmapProcessingRequired()) { - CommandList cmd_wetmaps = device->BeginCommandList(QUEUE_COMPUTE); - device->WaitCommandList(cmd_wetmaps, cmd); // wait for transparents, it will be scheduled with late frame (GUI, etc) + CommandList wetmap_cmd = device->BeginCommandList(QUEUE_COMPUTE); + device->WaitCommandList(wetmap_cmd, cmd); // wait for transparents, it will be scheduled with late frame (GUI, etc) // Note: GPU processing of this compute task can overlap with beginning of the next frame because no one is waiting for it - wi::jobsystem::Execute(ctx, [this, cmd_wetmaps](wi::jobsystem::JobArgs args) { - wi::renderer::RefreshWetmaps(*scene, cmd_wetmaps); + wi::jobsystem::Execute(ctx, [this, wetmap_cmd](wi::jobsystem::JobArgs args) { + wi::renderer::RefreshWetmaps(visibility_main, wetmap_cmd); }); } + cmd = device->BeginCommandList(); + wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) { + RenderPostprocessChain(cmd); + 
wi::renderer::TextureStreamingReadbackCopy(*scene, cmd); + }); + RenderPath2D::Render(); wi::jobsystem::Wait(ctx); @@ -1995,6 +2022,7 @@ namespace wi ); // Note: volumetrics and light shafts are blended before transparent scene, because they used depth of the opaques + // But the ocean is special, because it does have depth for them implicitly computed from ocean plane if (getVolumeLightsEnabled() && visibility_main.IsRequestedVolumetricLights()) { @@ -2131,6 +2159,9 @@ namespace wi { GraphicsDevice* device = wi::graphics::GetDevice(); + wi::renderer::BindCommonResources(cmd); + wi::renderer::BindCameraCB(*camera, camera_previous, camera_reflection, cmd); + const Texture* rt_first = nullptr; // not ping-ponged with read / write const Texture* rt_read = &rtMain; const Texture* rt_write = &rtPostprocess; diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp index 4938d2d99..9e97ee6ff 100644 --- a/WickedEngine/wiRenderer.cpp +++ b/WickedEngine/wiRenderer.cpp @@ -4570,6 +4570,18 @@ void UpdateRenderDataAsync( BindCommonResources(cmd); + // Wetmaps will be initialized: + for (uint32_t objectIndex = 0; objectIndex < vis.scene->objects.GetCount(); ++objectIndex) + { + const ObjectComponent& object = vis.scene->objects[objectIndex]; + if (!object.wetmap.IsValid() || object.wetmap_cleared) + continue; + device->ClearUAV(&object.wetmap, 0, cmd); + object.wetmap_cleared = true; + barrier_stack.push_back(GPUBarrier::Buffer(&object.wetmap, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE_COMPUTE)); + } + barrier_stack_flush(cmd); + // Precompute static volumetric cloud textures: if (!volumetric_clouds_precomputed && vis.scene->weather.IsVolumetricClouds()) { @@ -4766,6 +4778,8 @@ void UpdateOcean( CommandList cmd ) { + if (!vis.scene->weather.IsOceanEnabled() || !vis.scene->ocean.IsValid()) + return; bool occluded = false; if (vis.flags & wi::renderer::Visibility::ALLOW_OCCLUSION_CULLING) { @@ -4779,6 +4793,23 @@ void UpdateOcean( 
wi::profiler::EndRange(range); } } +void ReadbackOcean( + const Visibility& vis, + CommandList cmd +) +{ + if (!vis.scene->weather.IsOceanEnabled() || !vis.scene->ocean.IsValid()) + return; + bool occluded = false; + if (vis.flags & wi::renderer::Visibility::ALLOW_OCCLUSION_CULLING) + { + occluded = vis.scene->ocean.IsOccluded(); + } + if (!occluded) + { + vis.scene->ocean.CopyDisplacementMapReadback(cmd); + } +} void UpdateRaytracingAccelerationStructures(const Scene& scene, CommandList cmd) { @@ -6193,7 +6224,7 @@ void DrawScene( BindCommonResources(cmd); - if (ocean && !skip_planar_reflection_objects && vis.scene->weather.IsOceanEnabled()) + if (ocean && !skip_planar_reflection_objects && vis.scene->weather.IsOceanEnabled() && vis.scene->ocean.IsValid()) { if (!occlusion || !vis.scene->ocean.IsOccluded()) { @@ -9906,9 +9937,9 @@ void RefreshLightmaps(const Scene& scene, CommandList cmd) } } -void RefreshWetmaps(const Scene& scene, CommandList cmd) +void RefreshWetmaps(const Visibility& vis, CommandList cmd) { - if (!scene.IsWetmapProcessingRequired()) + if (!vis.scene->IsWetmapProcessingRequired()) return; device->EventBegin("RefreshWetmaps", cmd); @@ -9916,43 +9947,44 @@ void RefreshWetmaps(const Scene& scene, CommandList cmd) BindCommonResources(cmd); device->BindComputeShader(&shaders[CSTYPE_WETMAP_UPDATE], cmd); - for (uint32_t objectIndex = 0; objectIndex < scene.objects.GetCount(); ++objectIndex) + WetmapPush push = {}; + push.rain_amount = vis.scene->weather.rain_amount; + + // Note: every object wetmap is updated, not just visible + for (uint32_t objectIndex = 0; objectIndex < vis.scene->objects.GetCount(); ++objectIndex) { - const ObjectComponent& object = scene.objects[objectIndex]; + const ObjectComponent& object = vis.scene->objects[objectIndex]; if (!object.wetmap.IsValid()) continue; uint32_t vertexCount = uint32_t(object.wetmap.desc.size / GetFormatStride(object.wetmap.desc.format)); - WetmapPush push = {}; push.wetmap = 
device->GetDescriptorIndex(&object.wetmap, SubresourceType::UAV); if (push.wetmap < 0) continue; push.instanceID = objectIndex; - push.rain_amount = scene.weather.rain_amount; device->PushConstants(&push, sizeof(push), cmd); device->Dispatch((vertexCount + 63u) / 64u, 1, 1, cmd); } - for (uint32_t hairIndex = 0; hairIndex < scene.hairs.GetCount(); ++hairIndex) + // Note: only visible hair particles will be updated, because invisible ones will not have valid vertices + for (uint32_t hairIndex : vis.visibleHairs) { - const wi::HairParticleSystem& hair = scene.hairs[hairIndex]; + const wi::HairParticleSystem& hair = vis.scene->hairs[hairIndex]; if (!hair.wetmap.IsValid()) continue; uint32_t vertexCount = uint32_t(hair.wetmap.size / sizeof(uint16_t)); - WetmapPush push = {}; push.wetmap = hair.wetmap.descriptor_uav; if (push.wetmap < 0) continue; - push.instanceID = uint32_t(scene.objects.GetCount() + hairIndex); - push.rain_amount = scene.weather.rain_amount; + push.instanceID = uint32_t(vis.scene->objects.GetCount() + hairIndex); device->PushConstants(&push, sizeof(push), cmd); device->Dispatch((vertexCount + 63u) / 64u, 1, 1, cmd); @@ -16908,7 +16940,6 @@ void Postprocess_Downsample4x( { GPUBarrier barriers[] = { - GPUBarrier::Memory(), GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout), }; device->Barrier(barriers, arraysize(barriers), cmd); diff --git a/WickedEngine/wiRenderer.h b/WickedEngine/wiRenderer.h index f57df18bb..6cbf30a95 100644 --- a/WickedEngine/wiRenderer.h +++ b/WickedEngine/wiRenderer.h @@ -205,6 +205,11 @@ namespace wi::renderer const Visibility& vis, wi::graphics::CommandList cmd ); + // Readback the ocean, can be on async compute or async copy + void ReadbackOcean( + const Visibility& vis, + wi::graphics::CommandList cmd + ); void UpdateRaytracingAccelerationStructures(const wi::scene::Scene& scene, wi::graphics::CommandList cmd); @@ -312,7 +317,7 @@ namespace wi::renderer // Call once per frame to render lightmaps
void RefreshLightmaps(const wi::scene::Scene& scene, wi::graphics::CommandList cmd); // Call once per frame to render wetmaps - void RefreshWetmaps(const wi::scene::Scene& scene, wi::graphics::CommandList cmd); + void RefreshWetmaps(const Visibility& vis, wi::graphics::CommandList cmd); // Run a compute shader that will resolve a MSAA depth buffer to a single-sample texture void ResolveMSAADepthBuffer(const wi::graphics::Texture& dst, const wi::graphics::Texture& src, wi::graphics::CommandList cmd); void DownsampleDepthBuffer(const wi::graphics::Texture& src, wi::graphics::CommandList cmd); diff --git a/WickedEngine/wiScene.cpp b/WickedEngine/wiScene.cpp index f0beb2422..1baaefaf8 100644 --- a/WickedEngine/wiScene.cpp +++ b/WickedEngine/wiScene.cpp @@ -1036,6 +1036,7 @@ namespace wi::scene surfelgi = {}; ddgi = {}; + ocean = {}; aabb_objects.clear(); aabb_lights.clear(); @@ -4053,10 +4054,9 @@ namespace wi::scene desc.size = mesh.vertex_positions.size() * sizeof(uint16_t); desc.format = Format::R16_UNORM; desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS; - wi::vector zeroes(desc.size); - std::fill(zeroes.begin(), zeroes.end(), 0); - device->CreateBuffer(&desc, zeroes.data(), &object.wetmap); + device->CreateBuffer(&desc, nullptr, &object.wetmap); device->SetName(&object.wetmap, "wetmap"); + object.wetmap_cleared = false; } else if(!object.IsWetmapEnabled() && object.wetmap.IsValid()) { @@ -4759,6 +4759,10 @@ namespace wi::scene { OceanRegenerate(); } + if (!weather.IsOceanEnabled()) + { + ocean = {}; + } // Ocean occlusion status: if (!wi::renderer::GetFreezeCullingCameraEnabled() && weather.IsOceanEnabled()) diff --git a/WickedEngine/wiScene_Components.h b/WickedEngine/wiScene_Components.h index 770b07e18..afa8d2d0e 100644 --- a/WickedEngine/wiScene_Components.h +++ b/WickedEngine/wiScene_Components.h @@ -850,6 +850,7 @@ namespace wi::scene wi::graphics::GPUBuffer vb_ao; int vb_ao_srv = -1; wi::graphics::GPUBuffer wetmap; + mutable bool 
wetmap_cleared = false; XMFLOAT3 center = XMFLOAT3(0, 0, 0); float radius = 0;