From b78030b02fc4e0a288423400ec7e91e47e757644 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tur=C3=A1nszki=20J=C3=A1nos?=
 <turanszkij@users.noreply.github.com>
Date: Fri, 19 Jul 2024 06:56:14 +0200
Subject: [PATCH] async queue updates (#885)

---
 .github/workflows/build-nightly.yml      |  15 +--
 WickedEngine/shaders/ShaderInterop.h     |   5 +-
 WickedEngine/wiGraphicsDevice.h          |   8 ++
 WickedEngine/wiGraphicsDevice_DX12.cpp   | 164 ++++++++++++-----------
 WickedEngine/wiGraphicsDevice_DX12.h     |  45 ++++++-
 WickedEngine/wiGraphicsDevice_Vulkan.cpp | 119 +++++++++++-----
 WickedEngine/wiGraphicsDevice_Vulkan.h   |  34 ++++-
 WickedEngine/wiOcean.cpp                 |  17 ++-
 WickedEngine/wiOcean.h                   |   2 +
 WickedEngine/wiRenderPath3D.cpp          | 123 ++++++++++-------
 WickedEngine/wiRenderer.cpp              |  57 ++++++--
 WickedEngine/wiRenderer.h                |   7 +-
 WickedEngine/wiScene.cpp                 |  10 +-
 WickedEngine/wiScene_Components.h        |   1 +
 14 files changed, 403 insertions(+), 204 deletions(-)

diff --git a/.github/workflows/build-nightly.yml b/.github/workflows/build-nightly.yml
index 98332a3db..0c206627a 100644
--- a/.github/workflows/build-nightly.yml
+++ b/.github/workflows/build-nightly.yml
@@ -51,24 +51,17 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
-    - uses: actions/cache@v4
-      with:
-        path: ~/.cache/ccache
-        key: ccache-${{ github.run_id }}
-        restore-keys: ccache
-        save-always: true
-
     - name: Install dependencies
       run: |
         sudo apt update
-        sudo apt install libsdl2-dev ccache
+        sudo apt install libsdl2-dev
 
     - name: Initial compile
       run: |
         mkdir build
         cd build
-        cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-        CCACHE_NODIRECT=1 make -j$(nproc)
+        cmake .. -DCMAKE_BUILD_TYPE=Release
+        make -j$(nproc)
       
     - name: Generate shader dump
       run: |
@@ -79,7 +72,7 @@ jobs:
     - name: Recompile with shader dump
       run: |
         cd build
-        CCACHE_NODIRECT=1 make -B -j $(nproc)
+        make -B -j $(nproc)
         
     - name: Move files
       run: |
diff --git a/WickedEngine/shaders/ShaderInterop.h b/WickedEngine/shaders/ShaderInterop.h
index 6cfaee761..16e8cffbb 100644
--- a/WickedEngine/shaders/ShaderInterop.h
+++ b/WickedEngine/shaders/ShaderInterop.h
@@ -132,13 +132,12 @@ static const uint IndirectDispatchArgsAlignment = 4u;
 #define CBSLOT_OTHER_EMITTEDPARTICLE			4
 #define CBSLOT_OTHER_HAIRPARTICLE				4
 #define CBSLOT_OTHER_FFTGENERATOR				4
-#define CBSLOT_OTHER_OCEAN_SIMULATION_IMMUTABLE	4
-#define CBSLOT_OTHER_OCEAN_SIMULATION_PERFRAME	5
-#define CBSLOT_OTHER_OCEAN_RENDER				7
+#define CBSLOT_OTHER_OCEAN						4
 #define CBSLOT_OTHER_CLOUDGENERATOR				4
 #define CBSLOT_OTHER_GPUSORTLIB					4
 #define CBSLOT_MSAO								4
 #define CBSLOT_FSR								4
+#define CBSLOT_TRAILRENDERER					4
 #endif // !__PSSL__ && !__SCE__
 
 #endif // WI_SHADERINTEROP_H
diff --git a/WickedEngine/wiGraphicsDevice.h b/WickedEngine/wiGraphicsDevice.h
index cd1005d00..c02ee38f4 100644
--- a/WickedEngine/wiGraphicsDevice.h
+++ b/WickedEngine/wiGraphicsDevice.h
@@ -131,12 +131,14 @@ namespace wi::graphics
 		// Returns whether the graphics debug layer is enabled. It can be enabled when creating the device.
 		constexpr bool IsDebugDevice() const { return validationMode != ValidationMode::Disabled; }
 
+		// Get GPU-specific metrics:
 		constexpr size_t GetShaderIdentifierSize() const { return SHADER_IDENTIFIER_SIZE; }
 		constexpr size_t GetTopLevelAccelerationStructureInstanceSize() const { return TOPLEVEL_ACCELERATION_STRUCTURE_INSTANCE_SIZE; }
 		constexpr uint32_t GetVariableRateShadingTileSize() const { return VARIABLE_RATE_SHADING_TILE_SIZE; }
 		constexpr uint64_t GetTimestampFrequency() const { return TIMESTAMP_FREQUENCY; }
 		constexpr uint64_t GetVideoDecodeBitstreamAlignment() const { return VIDEO_DECODE_BITSTREAM_ALIGNMENT; }
 
+		// Get information about the graphics device manufacturer:
 		constexpr uint32_t GetVendorId() const { return vendorId; }
 		constexpr uint32_t GetDeviceId() const { return deviceId; }
 		constexpr const std::string& GetAdapterName() const { return adapterName; }
@@ -178,7 +180,13 @@ namespace wi::graphics
 		//	- These commands are not immediately executed, but they begin executing on the GPU after calling SubmitCommandLists()
 		//	- These are not thread safe, only a single thread should use a single CommandList at one time
 
+		// Tell the command list to wait for another command list which was started before it
+		//	The granularity of this is at least that the beginning of the command list will wait for the end of the other command list
+		//	On some platforms like PS5 this can be implemented by waiting exactly at the wait insertion point within the command lists, which is more precise
 		virtual void WaitCommandList(CommandList cmd, CommandList wait_for) = 0;
+		// Tell the command list to wait for the specified queue to finish processing
+		//	It is useful when you want to wait for a previous frame, or just don't know which command list to wait for
+		virtual void WaitQueue(CommandList cmd, QUEUE_TYPE wait_for) = 0;
 		virtual void RenderPassBegin(const SwapChain* swapchain, CommandList cmd) = 0;
 		virtual void RenderPassBegin(const RenderPassImage* images, uint32_t image_count, CommandList cmd, RenderPassFlags flags = RenderPassFlags::NONE) = 0;
 		virtual void RenderPassEnd(CommandList cmd) = 0;
diff --git a/WickedEngine/wiGraphicsDevice_DX12.cpp b/WickedEngine/wiGraphicsDevice_DX12.cpp
index 21d43c397..afbdef221 100644
--- a/WickedEngine/wiGraphicsDevice_DX12.cpp
+++ b/WickedEngine/wiGraphicsDevice_DX12.cpp
@@ -1610,7 +1610,38 @@ namespace dx12_internal
 }
 using namespace dx12_internal;
 
-	
+#ifdef PLATFORM_XBOX
+std::mutex queue_locker;
+#endif // PLATFORM_XBOX
+
+	void GraphicsDevice_DX12::CommandQueue::signal(const Semaphore& semaphore)
+	{
+		if (queue == nullptr)
+			return;
+		HRESULT hr = queue->Signal(semaphore.fence.Get(), semaphore.fenceValue);
+		assert(SUCCEEDED(hr));
+	}
+	void GraphicsDevice_DX12::CommandQueue::wait(const Semaphore& semaphore)
+	{
+		if (queue == nullptr)
+			return;
+		HRESULT hr = queue->Wait(semaphore.fence.Get(), semaphore.fenceValue);
+		assert(SUCCEEDED(hr));
+	}
+	void GraphicsDevice_DX12::CommandQueue::submit()
+	{
+		if (queue == nullptr)
+			return;
+		if (submit_cmds.empty())
+			return;
+
+		queue->ExecuteCommandLists(
+			(UINT)submit_cmds.size(),
+			submit_cmds.data()
+		);
+
+		submit_cmds.clear();
+	}
 
 	void GraphicsDevice_DX12::CopyAllocator::init(GraphicsDevice_DX12* device)
 	{
@@ -2461,15 +2492,6 @@ using namespace dx12_internal;
 			}
 			hr = queues[QUEUE_GRAPHICS].queue->SetName(L"QUEUE_GRAPHICS");
 			assert(SUCCEEDED(hr));
-			hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(queues[QUEUE_GRAPHICS].fence));
-			assert(SUCCEEDED(hr));
-			if (FAILED(hr))
-			{
-				std::stringstream ss("");
-				ss << "ID3D12Device::CreateFence[QUEUE_GRAPHICS] failed! ERROR: 0x" << std::hex << hr;
-				wi::helper::messageBox(ss.str(), "Error!");
-				wi::platform::Exit();
-			}
 		}
 
 		{
@@ -2488,15 +2510,6 @@ using namespace dx12_internal;
 			}
 			hr = queues[QUEUE_COMPUTE].queue->SetName(L"QUEUE_COMPUTE");
 			assert(SUCCEEDED(hr));
-			hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(queues[QUEUE_COMPUTE].fence));
-			assert(SUCCEEDED(hr));
-			if (FAILED(hr))
-			{
-				std::stringstream ss("");
-				ss << "ID3D12Device::CreateFence[QUEUE_COMPUTE] failed! ERROR: 0x" << std::hex << hr;
-				wi::helper::messageBox(ss.str(), "Error!");
-				wi::platform::Exit();
-			}
 		}
 
 		{
@@ -2515,15 +2528,6 @@ using namespace dx12_internal;
 			}
 			hr = queues[QUEUE_COPY].queue->SetName(L"QUEUE_COPY");
 			assert(SUCCEEDED(hr));
-			hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(queues[QUEUE_COPY].fence));
-			assert(SUCCEEDED(hr));
-			if (FAILED(hr))
-			{
-				std::stringstream ss("");
-				ss << "ID3D12Device::CreateFence[QUEUE_COPY] failed! ERROR: 0x" << std::hex << hr;
-				wi::helper::messageBox(ss.str(), "Error!");
-				wi::platform::Exit();
-			}
 		}
 
 		if (SUCCEEDED(device.As(&video_device)))
@@ -2539,15 +2543,6 @@ using namespace dx12_internal;
 				capabilities |= GraphicsDeviceCapability::VIDEO_DECODE_H264;
 				hr = queues[QUEUE_VIDEO_DECODE].queue->SetName(L"QUEUE_VIDEO_DECODE");
 				assert(SUCCEEDED(hr));
-				hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(queues[QUEUE_VIDEO_DECODE].fence));
-				assert(SUCCEEDED(hr));
-				if (FAILED(hr))
-				{
-					std::stringstream ss("");
-					ss << "ID3D12Device::CreateFence[QUEUE_VIDEO_DECODE] failed! ERROR: 0x" << std::hex << hr;
-					wi::helper::messageBox(ss.str(), "Error!");
-					wi::platform::Exit();
-				}
 			}
 		}
 
@@ -3552,10 +3547,11 @@ using namespace dx12_internal;
 		{
 			resourcedesc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
 		}
-		if (has_flag(desc->misc_flags, ResourceMiscFlag::VIDEO_DECODE))
+		if (!has_flag(desc->bind_flags, BindFlag::DEPTH_STENCIL) && resourcedesc.SampleDesc.Count <= 1)
 		{
-			// Because video queue can only transition from/to VIDEO_ and COMMON states, we will use COMMON internally and rely on implicit transition for DPB textures
-			//	(See how the resource barrier on video queue overrides any user specified state into COMMON)
+			// The copy and video queues have much stricter requirements for supported resource states, but they support
+			//	implicit promotion from COMMON state. Because the user is not allowed to set a resource to COMMON state, we use this flag
+			//	so textures automatically decay to COMMON state at the queue submit when they are left in a read-only state
 			resourcedesc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS;
 		}
 
@@ -5261,7 +5257,6 @@ using namespace dx12_internal;
 		commandlist.reset(GetBufferIndex());
 		commandlist.queue = queue;
 		commandlist.id = cmd_current;
-		commandlist.waited_on.store(false);
 
 		if (commandlist.GetCommandList() == nullptr)
 		{
@@ -5382,38 +5377,56 @@ using namespace dx12_internal;
 				assert(SUCCEEDED(hr));
 
 				CommandQueue& queue = queues[commandlist.queue];
+				const bool dependency = !commandlist.signals.empty() || !commandlist.waits.empty() || !commandlist.wait_queues.empty();
+
+				if (dependency)
+				{
+					// If the current commandlist must resolve a dependency, then previous ones will be submitted before doing that:
+					//	This improves GPU utilization because not the whole batch of command lists will need to synchronize, but only the one that handles it
+					queue.submit();
+				}
+
 				queue.submit_cmds.push_back(commandlist.GetCommandList());
 
-				if (commandlist.waited_on.load() || !commandlist.waits.empty())
+				if (dependency)
 				{
-					for (auto& wait : commandlist.waits)
+					for (auto& wait : commandlist.wait_queues)
 					{
-						// record wait for signal on a previous submit:
-						const CommandList_DX12& waitcommandlist = GetCommandList(wait);
-						hr = queue.queue->Wait(
-							queues[waitcommandlist.queue].fence.Get(),
-							FRAMECOUNT * commandlists.size() + (uint64_t)waitcommandlist.id
-						);
-						assert(SUCCEEDED(hr));
-					}
+						CommandQueue& waitqueue = queues[wait.first];
+						const Semaphore& semaphore = wait.second;
 
-					if (!queue.submit_cmds.empty())
-					{
-						queue.queue->ExecuteCommandLists(
-							(UINT)queue.submit_cmds.size(),
-							queue.submit_cmds.data()
-						);
-						queue.submit_cmds.clear();
-					}
+						// The WaitQueue operation will submit and signal the specified dependency queue:
+						waitqueue.submit();
+						waitqueue.signal(semaphore); // signals immediately after submit
 
-					if (commandlist.waited_on.load())
-					{
-						hr = queue.queue->Signal(
-							queue.fence.Get(),
-							FRAMECOUNT * commandlists.size() + (uint64_t)commandlist.id
-						);
-						assert(SUCCEEDED(hr));
+						// The current queue will be waiting for the dependency queue to complete:
+						queue.wait(semaphore);
+
+						// recycle semaphore:
+						free_semaphore(semaphore);
 					}
+					commandlist.wait_queues.clear();
+
+					for(auto& semaphore : commandlist.waits)
+					{
+						// Wait for command list dependency:
+						queue.wait(semaphore);
+
+						// semaphore is not recycled here, only the signals recycle themselves because the wait will use the same semaphore
+					}
+					commandlist.waits.clear();
+
+					queue.submit();
+
+					for(auto& semaphore : commandlist.signals)
+					{
+						// Signal this command list's completion:
+						queue.signal(semaphore);
+
+						// recycle semaphore:
+						free_semaphore(semaphore);
+					}
+					commandlist.signals.clear();
 				}
 
 				for (auto& x : commandlist.pipelines_worker)
@@ -5439,14 +5452,7 @@ using namespace dx12_internal;
 				if (queue.queue == nullptr)
 					continue;
 
-				if (!queue.submit_cmds.empty())
-				{
-					queue.queue->ExecuteCommandLists(
-						(UINT)queue.submit_cmds.size(),
-						queue.submit_cmds.data()
-					);
-					queue.submit_cmds.clear();
-				}
+				queue.submit();
 
 				hr = queue.queue->Signal(frame_fence[GetBufferIndex()][q].Get(), 1);
 				assert(SUCCEEDED(hr));
@@ -5947,8 +5953,14 @@ using namespace dx12_internal;
 		CommandList_DX12& commandlist = GetCommandList(cmd);
 		CommandList_DX12& commandlist_wait_for = GetCommandList(wait_for);
 		assert(commandlist_wait_for.id < commandlist.id); // can't wait for future command list!
-		commandlist.waits.push_back(wait_for);
-		commandlist_wait_for.waited_on.store(true);
+		Semaphore semaphore = new_semaphore();
+		commandlist.waits.push_back(semaphore);
+		commandlist_wait_for.signals.push_back(semaphore);
+	}
+	void GraphicsDevice_DX12::WaitQueue(CommandList cmd, QUEUE_TYPE wait_for)
+	{
+		CommandList_DX12& commandlist = GetCommandList(cmd);
+		commandlist.wait_queues.push_back(std::make_pair(wait_for, new_semaphore()));
 	}
 	void GraphicsDevice_DX12::RenderPassBegin(const SwapChain* swapchain, CommandList cmd)
 	{
diff --git a/WickedEngine/wiGraphicsDevice_DX12.h b/WickedEngine/wiGraphicsDevice_DX12.h
index e14939336..5d76f9eb1 100644
--- a/WickedEngine/wiGraphicsDevice_DX12.h
+++ b/WickedEngine/wiGraphicsDevice_DX12.h
@@ -71,17 +71,22 @@ namespace wi::graphics
 		D3D12_CPU_DESCRIPTOR_HANDLE nullUAV = {};
 		D3D12_CPU_DESCRIPTOR_HANDLE nullSAM = {};
 
+		struct Semaphore
+		{
+			Microsoft::WRL::ComPtr<ID3D12Fence> fence;
+			uint64_t fenceValue = 0;
+		};
+
 		struct CommandQueue
 		{
 			D3D12_COMMAND_QUEUE_DESC desc = {};
 			Microsoft::WRL::ComPtr<ID3D12CommandQueue> queue;
-			Microsoft::WRL::ComPtr<ID3D12Fence> fence;
 			wi::vector<ID3D12CommandList*> submit_cmds;
-		} queues[QUEUE_COUNT];
 
-#ifdef PLATFORM_XBOX
-		std::mutex queue_locker;
-#endif // PLATFORM_XBOX
+			void signal(const Semaphore& semaphore);
+			void wait(const Semaphore& semaphore);
+			void submit();
+		} queues[QUEUE_COUNT];
 
 		struct CopyAllocator
 		{
@@ -124,6 +129,28 @@ namespace wi::graphics
 			void flush(bool graphics, CommandList cmd);
 		};
 
+		wi::vector<Semaphore> semaphore_pool;
+		std::mutex semaphore_pool_locker;
+		Semaphore new_semaphore()
+		{
+			std::scoped_lock lck(semaphore_pool_locker);
+			if (semaphore_pool.empty())
+			{
+				Semaphore& dependency = semaphore_pool.emplace_back();
+				HRESULT hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, PPV_ARGS(dependency.fence));
+				assert(SUCCEEDED(hr));
+			}
+			Semaphore semaphore = std::move(semaphore_pool.back());
+			semaphore_pool.pop_back();
+			semaphore.fenceValue++;
+			return semaphore;
+		}
+		void free_semaphore(const Semaphore& semaphore)
+		{
+			std::scoped_lock lck(semaphore_pool_locker);
+			semaphore_pool.push_back(semaphore);
+		}
+
 		struct CommandList_DX12
 		{
 			Microsoft::WRL::ComPtr<ID3D12CommandAllocator> commandAllocators[BUFFERCOUNT][QUEUE_COUNT];
@@ -133,8 +160,9 @@ namespace wi::graphics
 
 			QUEUE_TYPE queue = {};
 			uint32_t id = 0;
-			wi::vector<CommandList> waits;
-			std::atomic_bool waited_on{ false };
+			wi::vector<std::pair<QUEUE_TYPE, Semaphore>> wait_queues;
+			wi::vector<Semaphore> waits;
+			wi::vector<Semaphore> signals;
 
 			DescriptorBinder binder;
 			GPULinearAllocator frame_allocators[BUFFERCOUNT];
@@ -176,7 +204,9 @@ namespace wi::graphics
 			void reset(uint32_t bufferindex)
 			{
 				buffer_index = bufferindex;
+				wait_queues.clear();
 				waits.clear();
+				signals.clear();
 				binder.reset();
 				frame_allocators[buffer_index].reset();
 				prev_pt = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED;
@@ -336,6 +366,7 @@ namespace wi::graphics
 		///////////////Thread-sensitive////////////////////////
 
 		void WaitCommandList(CommandList cmd, CommandList wait_for) override;
+		void WaitQueue(CommandList cmd, QUEUE_TYPE wait_for) override;
 		void RenderPassBegin(const SwapChain* swapchain, CommandList cmd) override;
 		void RenderPassBegin(const RenderPassImage* images, uint32_t image_count, CommandList cmd, RenderPassFlags flags = RenderPassFlags::NONE) override;
 		void RenderPassEnd(CommandList cmd) override;
diff --git a/WickedEngine/wiGraphicsDevice_Vulkan.cpp b/WickedEngine/wiGraphicsDevice_Vulkan.cpp
index 8d3d01672..f08b90580 100644
--- a/WickedEngine/wiGraphicsDevice_Vulkan.cpp
+++ b/WickedEngine/wiGraphicsDevice_Vulkan.cpp
@@ -348,16 +348,20 @@ namespace vulkan_internal
 		case ResourceState::UNORDERED_ACCESS:
 			return VK_IMAGE_LAYOUT_GENERAL;
 		case ResourceState::COPY_SRC:
-			return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
 		case ResourceState::COPY_DST:
-			return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+			// we can't assume transfer layout because it's allowed for resource to be used by multiple queues like DX12 (decay to common state), so this is a workaround
+			//	the problem is that image copy commands will require specifying the current layout, but different queues can often use textures in different layouts
+			return VK_IMAGE_LAYOUT_GENERAL;
 		case ResourceState::SHADING_RATE_SOURCE:
 			return VK_IMAGE_LAYOUT_FRAGMENT_SHADING_RATE_ATTACHMENT_OPTIMAL_KHR;
 		case ResourceState::VIDEO_DECODE_SRC:
 		case ResourceState::VIDEO_DECODE_DST:
 			return VK_IMAGE_LAYOUT_VIDEO_DECODE_DPB_KHR;
 		default:
-			return VK_IMAGE_LAYOUT_UNDEFINED;
+			// combination of state flags will default to general
+			//	whether the combination of states is valid needs to be validated by the user
+			//	combining read-only states should be fine
+			return VK_IMAGE_LAYOUT_GENERAL;
 		}
 	}
 	constexpr VkShaderStageFlags _ConvertStageFlags(ShaderStage value)
@@ -741,6 +745,7 @@ namespace vulkan_internal
 		std::shared_ptr<GraphicsDevice_Vulkan::AllocationHandler> allocationhandler;
 		VmaAllocation allocation = nullptr;
 		VkImage resource = VK_NULL_HANDLE;
+		VkImageLayout defaultLayout = VK_IMAGE_LAYOUT_GENERAL;
 		VkBuffer staging_resource = VK_NULL_HANDLE;
 		struct TextureSubresource
 		{
@@ -1321,6 +1326,26 @@ using namespace vulkan_internal;
 
 
 
+	void GraphicsDevice_Vulkan::CommandQueue::signal(VkSemaphore semaphore)
+	{
+		if (queue == VK_NULL_HANDLE)
+			return;
+		VkSemaphoreSubmitInfo& signalSemaphore = submit_signalSemaphoreInfos.emplace_back();
+		signalSemaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO;
+		signalSemaphore.semaphore = semaphore;
+		signalSemaphore.value = 0; // not a timeline semaphore
+		signalSemaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
+	}
+	void GraphicsDevice_Vulkan::CommandQueue::wait(VkSemaphore semaphore)
+	{
+		if (queue == VK_NULL_HANDLE)
+			return;
+		VkSemaphoreSubmitInfo& waitSemaphore = submit_waitSemaphoreInfos.emplace_back();
+		waitSemaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO;
+		waitSemaphore.semaphore = semaphore;
+		waitSemaphore.value = 0; // not a timeline semaphore
+		waitSemaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
+	}
 	void GraphicsDevice_Vulkan::CommandQueue::submit(GraphicsDevice_Vulkan* device, VkFence fence)
 	{
 		if (queue == VK_NULL_HANDLE)
@@ -1848,8 +1873,7 @@ using namespace vulkan_internal;
 							auto texture_internal = to_internal((const Texture*)&resource);
 							auto& subresource_descriptor = subresource >= 0 ? texture_internal->subresources_srv[subresource] : texture_internal->srv;
 							imageInfos.back().imageView = subresource_descriptor.image_view;
-
-							imageInfos.back().imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+							imageInfos.back().imageLayout = texture_internal->defaultLayout;
 						}
 					}
 					break;
@@ -3607,13 +3631,17 @@ using namespace vulkan_internal;
 			{
 				x.destroy();
 			}
-			vkDestroySemaphore(device, commandlist->semaphore, nullptr);
 		}
 		for (auto& x : pipelines_global)
 		{
 			vkDestroyPipeline(device, x.second, nullptr);
 		}
 
+		for (auto& x : semaphore_pool)
+		{
+			vkDestroySemaphore(device, x, nullptr);
+		}
+
 		vmaDestroyBuffer(allocationhandler->allocator, nullBuffer, nullBufferAllocation);
 		vkDestroyBufferView(device, nullBufferView, nullptr);
 		vmaDestroyImage(allocationhandler->allocator, nullImage1D, nullImageAllocation1D);
@@ -4056,6 +4084,7 @@ using namespace vulkan_internal;
 	{
 		auto internal_state = std::make_shared<Texture_Vulkan>();
 		internal_state->allocationhandler = allocationhandler;
+		internal_state->defaultLayout = _ConvertImageLayout(desc->layout);
 		texture->internal_state = internal_state;
 		texture->type = GPUResource::Type::TEXTURE;
 		texture->mapped_data = nullptr;
@@ -7045,7 +7074,6 @@ using namespace vulkan_internal;
 		commandlist.reset(GetBufferIndex());
 		commandlist.queue = queue;
 		commandlist.id = cmd_current;
-		commandlist.waited_on.store(false);
 
 		if (commandlist.GetCommandBuffer() == VK_NULL_HANDLE)
 		{
@@ -7090,11 +7118,6 @@ using namespace vulkan_internal;
 				commandlist.binder_pools[buffer].init(this);
 			}
 
-			VkSemaphoreCreateInfo createInfo = {};
-			createInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
-			res = vkCreateSemaphore(device, &createInfo, nullptr, &commandlist.semaphore);
-			assert(res == VK_SUCCESS);
-
 			commandlist.binder.init(this);
 		}
 
@@ -7157,6 +7180,14 @@ using namespace vulkan_internal;
 				assert(res == VK_SUCCESS);
 
 				CommandQueue& queue = queues[commandlist.queue];
+				const bool dependency = !commandlist.signals.empty() || !commandlist.waits.empty() || !commandlist.wait_queues.empty();
+
+				if (dependency)
+				{
+					// If the current commandlist must resolve a dependency, then previous ones will be submitted before doing that:
+					//	This improves GPU utilization because not the whole batch of command lists will need to synchronize, but only the one that handles it
+					queue.submit(this, VK_NULL_HANDLE);
+				}
 
 				VkCommandBufferSubmitInfo& cbSubmitInfo = queue.submit_cmds.emplace_back();
 				cbSubmitInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO;
@@ -7183,29 +7214,43 @@ using namespace vulkan_internal;
 					signalSemaphore.value = 0; // not a timeline semaphore
 				}
 
-				if (commandlist.waited_on.load() || !commandlist.waits.empty())
+				if (dependency)
 				{
-					for (auto& wait : commandlist.waits)
+					for (auto& wait : commandlist.wait_queues)
+					{
+						CommandQueue& waitqueue = queues[wait.first];
+						VkSemaphore semaphore = wait.second;
+
+						// The WaitQueue operation will submit and signal the specified dependency queue:
+						waitqueue.signal(semaphore); // signal recorded, will be executed at submit
+						waitqueue.submit(this, VK_NULL_HANDLE);
+
+						// The current queue will be waiting for the dependency queue to complete:
+						queue.wait(semaphore);
+
+						// recycle semaphore
+						free_semaphore(semaphore);
+					}
+					commandlist.wait_queues.clear();
+
+					for (auto& semaphore : commandlist.waits)
 					{
 						// Wait for command list dependency:
-						CommandList_Vulkan& waitcommandlist = GetCommandList(wait);
+						queue.wait(semaphore);
 
-						VkSemaphoreSubmitInfo& waitSemaphore = queue.submit_waitSemaphoreInfos.emplace_back();
-						waitSemaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO;
-						waitSemaphore.semaphore = waitcommandlist.semaphore;
-						waitSemaphore.value = 0; // not a timeline semaphore
-						waitSemaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
+						// semaphore is not recycled here, only the signals recycle themselves because the wait will use the same semaphore
 					}
+					commandlist.waits.clear();
 
-					if (commandlist.waited_on.load())
+					for (auto& semaphore : commandlist.signals)
 					{
 						// Signal this command list's completion:
-						VkSemaphoreSubmitInfo& signalSemaphore = queue.submit_signalSemaphoreInfos.emplace_back();
-						signalSemaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO;
-						signalSemaphore.semaphore = commandlist.semaphore;
-						signalSemaphore.value = 0; // not a timeline semaphore
-						signalSemaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
+						queue.signal(semaphore);
+
+						// recycle semaphore
+						free_semaphore(semaphore);
 					}
+					commandlist.signals.clear();
 
 					queue.submit(this, VK_NULL_HANDLE);
 				}
@@ -7556,8 +7601,14 @@ using namespace vulkan_internal;
 		CommandList_Vulkan& commandlist = GetCommandList(cmd);
 		CommandList_Vulkan& commandlist_wait_for = GetCommandList(wait_for);
 		assert(commandlist_wait_for.id < commandlist.id); // can't wait for future command list!
-		commandlist.waits.push_back(wait_for);
-		commandlist_wait_for.waited_on.store(true);
+		VkSemaphore semaphore = new_semaphore();
+		commandlist.waits.push_back(semaphore);
+		commandlist_wait_for.signals.push_back(semaphore);
+	}
+	void GraphicsDevice_Vulkan::WaitQueue(CommandList cmd, QUEUE_TYPE wait_for)
+	{
+		CommandList_Vulkan& commandlist = GetCommandList(cmd);
+		commandlist.wait_queues.push_back(std::make_pair(wait_for, new_semaphore()));
 	}
 	void GraphicsDevice_Vulkan::RenderPassBegin(const SwapChain* swapchain, CommandList cmd)
 	{
@@ -8435,7 +8486,7 @@ using namespace vulkan_internal;
 						commandlist.GetCommandBuffer(),
 						internal_state_src->staging_resource,
 						internal_state_dst->resource,
-						VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+						_ConvertImageLayout(ResourceState::COPY_DST),
 						1,
 						&copy
 					);
@@ -8473,7 +8524,7 @@ using namespace vulkan_internal;
 						vkCmdCopyImageToBuffer(
 							commandlist.GetCommandBuffer(),
 							internal_state_src->resource,
-							VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+							_ConvertImageLayout(ResourceState::COPY_SRC),
 							internal_state_dst->staging_resource,
 							1,
 							&copy
@@ -8536,8 +8587,8 @@ using namespace vulkan_internal;
 				copy.dstSubresource.mipLevel = 0;
 
 				vkCmdCopyImage(commandlist.GetCommandBuffer(),
-					internal_state_src->resource, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
-					internal_state_dst->resource, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+					internal_state_src->resource, _ConvertImageLayout(ResourceState::COPY_SRC),
+					internal_state_dst->resource, _ConvertImageLayout(ResourceState::COPY_DST),
 					1, &copy
 				);
 			}
@@ -8633,9 +8684,9 @@ using namespace vulkan_internal;
 		vkCmdCopyImage(
 			commandlist.GetCommandBuffer(),
 			src_internal->resource,
-			VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+			_ConvertImageLayout(ResourceState::COPY_SRC),
 			dst_internal->resource,
-			VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+			_ConvertImageLayout(ResourceState::COPY_DST),
 			1,
 			&copy
 		);
diff --git a/WickedEngine/wiGraphicsDevice_Vulkan.h b/WickedEngine/wiGraphicsDevice_Vulkan.h
index 0d19a05ad..09283f2b4 100644
--- a/WickedEngine/wiGraphicsDevice_Vulkan.h
+++ b/WickedEngine/wiGraphicsDevice_Vulkan.h
@@ -120,6 +120,8 @@ namespace wi::graphics
 			bool sparse_binding_supported = false;
 			std::shared_ptr<std::mutex> locker;
 
+			void signal(VkSemaphore semaphore);
+			void wait(VkSemaphore semaphore);
 			void submit(GraphicsDevice_Vulkan* device, VkFence fence);
 
 		} queues[QUEUE_COUNT];
@@ -193,17 +195,40 @@ namespace wi::graphics
 			void reset();
 		};
 
+		wi::vector<VkSemaphore> semaphore_pool;
+		std::mutex semaphore_pool_locker;
+		VkSemaphore new_semaphore()
+		{
+			std::scoped_lock lck(semaphore_pool_locker);
+			if (semaphore_pool.empty())
+			{
+				VkSemaphore& sema = semaphore_pool.emplace_back();
+				VkSemaphoreCreateInfo info = {};
+				info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+				VkResult res = vkCreateSemaphore(device, &info, nullptr, &sema);
+				assert(res == VK_SUCCESS);
+			}
+			VkSemaphore semaphore = semaphore_pool.back();
+			semaphore_pool.pop_back();
+			return semaphore;
+		}
+		void free_semaphore(VkSemaphore semaphore)
+		{
+			std::scoped_lock lck(semaphore_pool_locker);
+			semaphore_pool.push_back(semaphore);
+		}
+
 		struct CommandList_Vulkan
 		{
-			VkSemaphore semaphore = VK_NULL_HANDLE;
 			VkCommandPool commandPools[BUFFERCOUNT][QUEUE_COUNT] = {};
 			VkCommandBuffer commandBuffers[BUFFERCOUNT][QUEUE_COUNT] = {};
 			uint32_t buffer_index = 0;
 
 			QUEUE_TYPE queue = {};
 			uint32_t id = 0;
-			wi::vector<CommandList> waits;
-			std::atomic_bool waited_on{ false };
+			wi::vector<std::pair<QUEUE_TYPE, VkSemaphore>> wait_queues;
+			wi::vector<VkSemaphore> waits;
+			wi::vector<VkSemaphore> signals;
 
 			DescriptorBinder binder;
 			DescriptorBinderPool binder_pools[BUFFERCOUNT];
@@ -229,7 +254,9 @@ namespace wi::graphics
 			void reset(uint32_t bufferindex)
 			{
 				buffer_index = bufferindex;
+				wait_queues.clear();
 				waits.clear();
+				signals.clear();
 				binder_pools[buffer_index].reset();
 				binder.reset();
 				frame_allocators[buffer_index].reset();
@@ -370,6 +397,7 @@ namespace wi::graphics
 		///////////////Thread-sensitive////////////////////////
 
 		void WaitCommandList(CommandList cmd, CommandList wait_for) override;
+		void WaitQueue(CommandList cmd, QUEUE_TYPE wait_for) override;
 		void RenderPassBegin(const SwapChain* swapchain, CommandList cmd) override;
 		void RenderPassBegin(const RenderPassImage* images, uint32_t image_count, CommandList cmd, RenderPassFlags flags = RenderPassFlags::NONE) override;
 		void RenderPassEnd(CommandList cmd) override;
diff --git a/WickedEngine/wiOcean.cpp b/WickedEngine/wiOcean.cpp
index 5d22ebcff..06dce6d55 100644
--- a/WickedEngine/wiOcean.cpp
+++ b/WickedEngine/wiOcean.cpp
@@ -187,6 +187,7 @@ namespace wi
 		SubresourceData initdata;
 		initdata.data_ptr = displacementdata.data();
 		initdata.row_pitch = tex_desc.width * sizeof(XMFLOAT4);
+		tex_desc.layout = ResourceState::COPY_SRC | ResourceState::SHADER_RESOURCE_COMPUTE;
 		device->CreateTexture(&tex_desc, &initdata, &displacementMap);
 		device->SetName(&displacementMap, "displacementMap");
 
@@ -429,16 +430,18 @@ namespace wi
 
 		wi::renderer::GenerateMipChain(gradientMap, wi::renderer::MIPGENFILTER_LINEAR, cmd);
 
-		// Copy displacement map to readback:
-		device->Barrier(GPUBarrier::Image(&displacementMap, displacementMap.desc.layout, ResourceState::COPY_SRC), cmd);
-		device->CopyResource(&displacementMap_readback[displacement_readback_index], &displacementMap, cmd);
-		displacement_readback_valid[displacement_readback_index] = true;
-		displacement_readback_index = (displacement_readback_index + 1) % device->GetBufferCount();
-		device->Barrier(GPUBarrier::Image(&displacementMap, ResourceState::COPY_SRC, displacementMap.desc.layout), cmd);
-
 		device->EventEnd(cmd);
 	}
 
+	void Ocean::CopyDisplacementMapReadback(wi::graphics::CommandList cmd) const
+	{
+		GraphicsDevice* device = wi::graphics::GetDevice();
+		device->EventBegin("Ocean Readback Copy", cmd);
+		device->CopyResource(&displacementMap_readback[displacement_readback_index], &displacementMap, cmd);
+		displacement_readback_valid[displacement_readback_index] = true;
+		displacement_readback_index = (displacement_readback_index + 1) % device->GetBufferCount();
+		device->EventEnd(cmd);
+	}
 
 	void Ocean::Render(const CameraComponent& camera, CommandList cmd) const
 	{
diff --git a/WickedEngine/wiOcean.h b/WickedEngine/wiOcean.h
index 47b00746b..a4f7157a5 100644
--- a/WickedEngine/wiOcean.h
+++ b/WickedEngine/wiOcean.h
@@ -43,6 +43,8 @@ namespace wi
 		void UpdateDisplacementMap(wi::graphics::CommandList cmd) const;
 		void Render(const wi::scene::CameraComponent& camera, wi::graphics::CommandList cmd) const;
 
+		void CopyDisplacementMapReadback(wi::graphics::CommandList cmd) const;
+
 		const wi::graphics::Texture* getDisplacementMap() const;
 		const wi::graphics::Texture* getGradientMap() const;
 
diff --git a/WickedEngine/wiRenderPath3D.cpp b/WickedEngine/wiRenderPath3D.cpp
index a63c210b2..91492dee7 100644
--- a/WickedEngine/wiRenderPath3D.cpp
+++ b/WickedEngine/wiRenderPath3D.cpp
@@ -800,6 +800,7 @@ namespace wi
 		if (scene->terrains.GetCount() > 0)
 		{
 			cmd_copypages = device->BeginCommandList(QUEUE_COPY);
+			device->WaitQueue(cmd_copypages, QUEUE_GRAPHICS); // sync to prev frame graphics
 			wi::jobsystem::Execute(ctx, [this, cmd_copypages](wi::jobsystem::JobArgs args) {
 				for (size_t i = 0; i < scene->terrains.GetCount(); ++i)
 				{
@@ -810,6 +811,7 @@ namespace wi
 
 		// Preparing the frame:
 		CommandList cmd = device->BeginCommandList();
+		device->WaitQueue(cmd, QUEUE_COMPUTE); // sync to prev frame compute (prevents a compute task of the previous frame from overlapping with this frame's updates to global scene resources)
 		CommandList cmd_prepareframe = cmd;
 		wi::renderer::ProcessDeferredTextureRequests(cmd); // Execute it first thing in the frame here, on main thread, to not allow other thread steal it and execute on different command list!
 		wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) {
@@ -837,7 +839,7 @@ namespace wi
 
 		});
 
-		//	async compute parallel with depth prepass
+		// async compute parallel with depth prepass
 		cmd = device->BeginCommandList(QUEUE_COMPUTE);
 		CommandList cmd_prepareframe_async = cmd;
 		device->WaitCommandList(cmd, cmd_prepareframe);
@@ -1111,13 +1113,26 @@ namespace wi
 
 		});
 
+		CommandList cmd_ocean;
+		if (scene->weather.IsOceanEnabled() && scene->ocean.IsValid())
+		{
+			// Ocean simulation can be updated async to opaque passes:
+			cmd_ocean = device->BeginCommandList(QUEUE_COMPUTE);
+			wi::renderer::UpdateOcean(visibility_main, cmd_ocean);
+
+			// Copying to readback is done on copy queue to use DMA instead of compute warps:
+			CommandList cmd_oceancopy = device->BeginCommandList(QUEUE_COPY);
+			device->WaitCommandList(cmd_oceancopy, cmd_ocean);
+			wi::renderer::ReadbackOcean(visibility_main, cmd_oceancopy);
+		}
+
 		// Shadow maps:
 		if (getShadowsEnabled())
 		{
 			cmd = device->BeginCommandList();
 			wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) {
 				wi::renderer::DrawShadowmaps(visibility_main, cmd);
-				});
+			});
 		}
 
 		if (wi::renderer::GetVXGIEnabled() && getSceneUpdateEnabled())
@@ -1331,12 +1346,49 @@ namespace wi
 			});
 		}
 
-		if (scene->weather.IsOceanEnabled())
+		// Main camera weather compute effects depending on shadow maps, envmaps, etc, but don't depend on async surface pass:
+		if (scene->weather.IsRealisticSky() || scene->weather.IsVolumetricClouds())
 		{
-			// Ocean simulation can be updated async to opaque passes:
-			CommandList cmd_ocean = device->BeginCommandList(QUEUE_COMPUTE);
-			device->WaitCommandList(cmd_ocean, cmd);
-			wi::renderer::UpdateOcean(visibility_main, cmd_ocean);
+			cmd = device->BeginCommandList();
+			wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) {
+
+				wi::renderer::BindCameraCB(
+					*camera,
+					camera_previous,
+					camera_reflection,
+					cmd
+				);
+
+				if (scene->weather.IsRealisticSky())
+				{
+					wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
+
+					if (scene->weather.IsRealisticSkyAerialPerspective())
+					{
+						wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd);
+					}
+				}
+				if (scene->weather.IsRealisticSky() && scene->weather.IsRealisticSkyAerialPerspective())
+				{
+					wi::renderer::Postprocess_AerialPerspective(
+						aerialperspectiveResources,
+						cmd
+					);
+				}
+				if (scene->weather.IsVolumetricClouds())
+				{
+					wi::renderer::Postprocess_VolumetricClouds(
+						volumetriccloudResources,
+						cmd,
+						*camera,
+						camera_previous,
+						camera_reflection,
+						wi::renderer::GetTemporalAAEnabled() || getFSR2Enabled(),
+						scene->weather.volumetricCloudsWeatherMapFirst.IsValid() ? &scene->weather.volumetricCloudsWeatherMapFirst.GetTexture() : nullptr,
+						scene->weather.volumetricCloudsWeatherMapSecond.IsValid() ? &scene->weather.volumetricCloudsWeatherMapSecond.GetTexture() : nullptr
+					);
+				}
+			});
 		}
 
 		// Main camera opaque color pass:
@@ -1354,17 +1406,6 @@ namespace wi
 				cmd
 			);
 
-			// This can't run in "main camera compute effects" async compute,
-			//	because it depends on shadow maps, and envmaps
-			if (scene->weather.IsRealisticSky())
-			{
-				wi::renderer::ComputeSkyAtmosphereSkyViewLut(cmd);
-
-				if (scene->weather.IsRealisticSkyAerialPerspective())
-				{
-					wi::renderer::ComputeSkyAtmosphereCameraVolumeLut(cmd);
-				}
-			}
 			if (getRaytracedReflectionEnabled())
 			{
 				wi::renderer::Postprocess_RTReflection(
@@ -1395,26 +1436,6 @@ namespace wi
 					cmd
 				);
 			}
-			if (scene->weather.IsRealisticSky() && scene->weather.IsRealisticSkyAerialPerspective())
-			{
-				wi::renderer::Postprocess_AerialPerspective(
-					aerialperspectiveResources,
-					cmd
-				);
-			}
-			if (scene->weather.IsVolumetricClouds())
-			{
-				wi::renderer::Postprocess_VolumetricClouds(
-					volumetriccloudResources,
-					cmd,
-					*camera,
-					camera_previous,
-					camera_reflection,
-					wi::renderer::GetTemporalAAEnabled() || getFSR2Enabled(),
-					scene->weather.volumetricCloudsWeatherMapFirst.IsValid() ? &scene->weather.volumetricCloudsWeatherMapFirst.GetTexture() : nullptr,
-					scene->weather.volumetricCloudsWeatherMapSecond.IsValid() ? &scene->weather.volumetricCloudsWeatherMapSecond.GetTexture() : nullptr
-				);
-			}
 
 			// Depth buffers were created on COMPUTE queue, so make them available for pixel shaders here:
 			{
@@ -1593,6 +1614,10 @@ namespace wi
 
 		// Transparents, post processes, etc:
 		cmd = device->BeginCommandList();
+		if (cmd_ocean.IsValid())
+		{
+			device->WaitCommandList(cmd, cmd_ocean);
+		}
 		wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) {
 
 			GraphicsDevice* device = wi::graphics::GetDevice();
@@ -1613,8 +1638,6 @@ namespace wi
 
 			RenderTransparents(cmd);
 
-			RenderPostprocessChain(cmd);
-
 			// Depth buffers expect a non-pixel shader resource state as they are generated on compute queue:
 			{
 				GPUBarrier barriers[] = {
@@ -1624,20 +1647,24 @@ namespace wi
 				};
 				device->Barrier(barriers, arraysize(barriers), cmd);
 			}
-
-			wi::renderer::TextureStreamingReadbackCopy(*scene, cmd);
 		});
 
 		if (scene->IsWetmapProcessingRequired())
 		{
-			CommandList cmd_wetmaps = device->BeginCommandList(QUEUE_COMPUTE);
-			device->WaitCommandList(cmd_wetmaps, cmd); // wait for transparents, it will be scheduled with late frame (GUI, etc)
+			CommandList wetmap_cmd = device->BeginCommandList(QUEUE_COMPUTE);
+			device->WaitCommandList(wetmap_cmd, cmd); // wait for transparents, it will be scheduled with late frame (GUI, etc)
 			// Note: GPU processing of this compute task can overlap with beginning of the next frame because no one is waiting for it
-			wi::jobsystem::Execute(ctx, [this, cmd_wetmaps](wi::jobsystem::JobArgs args) {
-				wi::renderer::RefreshWetmaps(*scene, cmd_wetmaps);
+			wi::jobsystem::Execute(ctx, [this, wetmap_cmd](wi::jobsystem::JobArgs args) {
+				wi::renderer::RefreshWetmaps(visibility_main, wetmap_cmd);
 			});
 		}
 
+		cmd = device->BeginCommandList();
+		wi::jobsystem::Execute(ctx, [this, cmd](wi::jobsystem::JobArgs args) {
+			RenderPostprocessChain(cmd);
+			wi::renderer::TextureStreamingReadbackCopy(*scene, cmd);
+		});
+
 		RenderPath2D::Render();
 
 		wi::jobsystem::Wait(ctx);
@@ -1995,6 +2022,7 @@ namespace wi
 		);
 
 		// Note: volumetrics and light shafts are blended before transparent scene, because they used depth of the opaques
+		//	But the ocean is special, because it does have depth for them implicitly computed from ocean plane
 
 		if (getVolumeLightsEnabled() && visibility_main.IsRequestedVolumetricLights())
 		{
@@ -2131,6 +2159,9 @@ namespace wi
 	{
 		GraphicsDevice* device = wi::graphics::GetDevice();
 
+		wi::renderer::BindCommonResources(cmd);
+		wi::renderer::BindCameraCB(*camera, camera_previous, camera_reflection, cmd);
+
 		const Texture* rt_first = nullptr; // not ping-ponged with read / write
 		const Texture* rt_read = &rtMain;
 		const Texture* rt_write = &rtPostprocess;
diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp
index 4938d2d99..9e97ee6ff 100644
--- a/WickedEngine/wiRenderer.cpp
+++ b/WickedEngine/wiRenderer.cpp
@@ -4570,6 +4570,18 @@ void UpdateRenderDataAsync(
 
 	BindCommonResources(cmd);
 
+	// Wetmaps will be initialized:
+	for (uint32_t objectIndex = 0; objectIndex < vis.scene->objects.GetCount(); ++objectIndex)
+	{
+		const ObjectComponent& object = vis.scene->objects[objectIndex];
+		if (!object.wetmap.IsValid() || object.wetmap_cleared)
+			continue;
+		device->ClearUAV(&object.wetmap, 0, cmd);
+		object.wetmap_cleared = true;
+		barrier_stack.push_back(GPUBarrier::Buffer(&object.wetmap, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE_COMPUTE));
+	}
+	barrier_stack_flush(cmd);
+
 	// Precompute static volumetric cloud textures:
 	if (!volumetric_clouds_precomputed && vis.scene->weather.IsVolumetricClouds())
 	{
@@ -4766,6 +4778,8 @@ void UpdateOcean(
 	CommandList cmd
 )
 {
+	if (!vis.scene->weather.IsOceanEnabled() || !vis.scene->ocean.IsValid())
+		return;
 	bool occluded = false;
 	if (vis.flags & wi::renderer::Visibility::ALLOW_OCCLUSION_CULLING)
 	{
@@ -4779,6 +4793,23 @@ void UpdateOcean(
 		wi::profiler::EndRange(range);
 	}
 }
+void ReadbackOcean(
+	const Visibility& vis,
+	CommandList cmd
+)
+{
+	if (!vis.scene->weather.IsOceanEnabled() || !vis.scene->ocean.IsValid())
+		return;
+	bool occluded = false;
+	if (vis.flags & wi::renderer::Visibility::ALLOW_OCCLUSION_CULLING)
+	{
+		occluded = vis.scene->ocean.IsOccluded();
+	}
+	if (!occluded)
+	{
+		vis.scene->ocean.CopyDisplacementMapReadback(cmd);
+	}
+}
 
 void UpdateRaytracingAccelerationStructures(const Scene& scene, CommandList cmd)
 {
@@ -6193,7 +6224,7 @@ void DrawScene(
 
 	BindCommonResources(cmd);
 
-	if (ocean && !skip_planar_reflection_objects && vis.scene->weather.IsOceanEnabled())
+	if (ocean && !skip_planar_reflection_objects && vis.scene->weather.IsOceanEnabled() && vis.scene->ocean.IsValid())
 	{
 		if (!occlusion || !vis.scene->ocean.IsOccluded())
 		{
@@ -9906,9 +9937,9 @@ void RefreshLightmaps(const Scene& scene, CommandList cmd)
 	}
 }
 
-void RefreshWetmaps(const Scene& scene, CommandList cmd)
+void RefreshWetmaps(const Visibility& vis, CommandList cmd)
 {
-	if (!scene.IsWetmapProcessingRequired())
+	if (!vis.scene->IsWetmapProcessingRequired())
 		return;
 
 	device->EventBegin("RefreshWetmaps", cmd);
@@ -9916,43 +9947,44 @@ void RefreshWetmaps(const Scene& scene, CommandList cmd)
 	BindCommonResources(cmd);
 	device->BindComputeShader(&shaders[CSTYPE_WETMAP_UPDATE], cmd);
 
-	for (uint32_t objectIndex = 0; objectIndex < scene.objects.GetCount(); ++objectIndex)
+	WetmapPush push = {};
+	push.rain_amount = vis.scene->weather.rain_amount;
+
+	// Note: every object wetmap is updated, not just visible
+	for (uint32_t objectIndex = 0; objectIndex < vis.scene->objects.GetCount(); ++objectIndex)
 	{
-		const ObjectComponent& object = scene.objects[objectIndex];
+		const ObjectComponent& object = vis.scene->objects[objectIndex];
 		if (!object.wetmap.IsValid())
 			continue;
 
 		uint32_t vertexCount = uint32_t(object.wetmap.desc.size / GetFormatStride(object.wetmap.desc.format));
 
-		WetmapPush push = {};
 		push.wetmap = device->GetDescriptorIndex(&object.wetmap, SubresourceType::UAV);
 
 		if (push.wetmap < 0)
 			continue;
 
 		push.instanceID = objectIndex;
-		push.rain_amount = scene.weather.rain_amount;
 		device->PushConstants(&push, sizeof(push), cmd);
 
 		device->Dispatch((vertexCount + 63u) / 64u, 1, 1, cmd);
 	}
 
-	for (uint32_t hairIndex = 0; hairIndex < scene.hairs.GetCount(); ++hairIndex)
+	// Note: only visible hair particles will be updated, because invisible ones will not have valid vertices
+	for (uint32_t hairIndex : vis.visibleHairs)
 	{
-		const wi::HairParticleSystem& hair = scene.hairs[hairIndex];
+		const wi::HairParticleSystem& hair = vis.scene->hairs[hairIndex];
 		if (!hair.wetmap.IsValid())
 			continue;
 
 		uint32_t vertexCount = uint32_t(hair.wetmap.size / sizeof(uint16_t));
 
-		WetmapPush push = {};
 		push.wetmap = hair.wetmap.descriptor_uav;
 
 		if (push.wetmap < 0)
 			continue;
 
-		push.instanceID = uint32_t(scene.objects.GetCount() + hairIndex);
-		push.rain_amount = scene.weather.rain_amount;
+		push.instanceID = uint32_t(vis.scene->objects.GetCount() + hairIndex);
 		device->PushConstants(&push, sizeof(push), cmd);
 
 		device->Dispatch((vertexCount + 63u) / 64u, 1, 1, cmd);
@@ -16908,7 +16940,6 @@ void Postprocess_Downsample4x(
 
 	{
 		GPUBarrier barriers[] = {
-			GPUBarrier::Memory(),
 			GPUBarrier::Image(&output, ResourceState::UNORDERED_ACCESS, output.desc.layout),
 		};
 		device->Barrier(barriers, arraysize(barriers), cmd);
diff --git a/WickedEngine/wiRenderer.h b/WickedEngine/wiRenderer.h
index f57df18bb..6cbf30a95 100644
--- a/WickedEngine/wiRenderer.h
+++ b/WickedEngine/wiRenderer.h
@@ -205,6 +205,11 @@ namespace wi::renderer
 		const Visibility& vis,
 		wi::graphics::CommandList cmd
 	);
+	// Read back the ocean displacement map, can be on async compute or async copy
+	void ReadbackOcean(
+		const Visibility& vis,
+		wi::graphics::CommandList cmd
+	);
 
 	void UpdateRaytracingAccelerationStructures(const wi::scene::Scene& scene, wi::graphics::CommandList cmd);
 
@@ -312,7 +317,7 @@ namespace wi::renderer
 	// Call once per frame to render lightmaps
 	void RefreshLightmaps(const wi::scene::Scene& scene, wi::graphics::CommandList cmd);
 	// Call once per frame to render wetmaps
-	void RefreshWetmaps(const wi::scene::Scene& scene, wi::graphics::CommandList cmd);
+	void RefreshWetmaps(const Visibility& vis, wi::graphics::CommandList cmd);
 	// Run a compute shader that will resolve a MSAA depth buffer to a single-sample texture
 	void ResolveMSAADepthBuffer(const wi::graphics::Texture& dst, const wi::graphics::Texture& src, wi::graphics::CommandList cmd);
 	void DownsampleDepthBuffer(const wi::graphics::Texture& src, wi::graphics::CommandList cmd);
diff --git a/WickedEngine/wiScene.cpp b/WickedEngine/wiScene.cpp
index f0beb2422..1baaefaf8 100644
--- a/WickedEngine/wiScene.cpp
+++ b/WickedEngine/wiScene.cpp
@@ -1036,6 +1036,7 @@ namespace wi::scene
 
 		surfelgi = {};
 		ddgi = {};
+		ocean = {};
 
 		aabb_objects.clear();
 		aabb_lights.clear();
@@ -4053,10 +4054,9 @@ namespace wi::scene
 					desc.size = mesh.vertex_positions.size() * sizeof(uint16_t);
 					desc.format = Format::R16_UNORM;
 					desc.bind_flags = BindFlag::SHADER_RESOURCE | BindFlag::UNORDERED_ACCESS;
-					wi::vector<uint8_t> zeroes(desc.size);
-					std::fill(zeroes.begin(), zeroes.end(), 0);
-					device->CreateBuffer(&desc, zeroes.data(), &object.wetmap);
+					device->CreateBuffer(&desc, nullptr, &object.wetmap);
 					device->SetName(&object.wetmap, "wetmap");
+					object.wetmap_cleared = false;
 				}
 				else if(!object.IsWetmapEnabled() && object.wetmap.IsValid())
 				{
@@ -4759,6 +4759,10 @@ namespace wi::scene
 			{
 				OceanRegenerate();
 			}
+			if (!weather.IsOceanEnabled())
+			{
+				ocean = {};
+			}
 
 			// Ocean occlusion status:
 			if (!wi::renderer::GetFreezeCullingCameraEnabled() && weather.IsOceanEnabled())
diff --git a/WickedEngine/wiScene_Components.h b/WickedEngine/wiScene_Components.h
index 770b07e18..afa8d2d0e 100644
--- a/WickedEngine/wiScene_Components.h
+++ b/WickedEngine/wiScene_Components.h
@@ -850,6 +850,7 @@ namespace wi::scene
 		wi::graphics::GPUBuffer vb_ao;
 		int vb_ao_srv = -1;
 		wi::graphics::GPUBuffer wetmap;
+		mutable bool wetmap_cleared = false;
 
 		XMFLOAT3 center = XMFLOAT3(0, 0, 0);
 		float radius = 0;