From 9f2005113b7576ae37162d9a6c0522bb904f94c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 29 May 2023 10:53:50 +0200
Subject: [PATCH 1/4] Add a function to the dev menu to intentionally lose the
 VK device. To be used for debugging recovery.

---
 Common/GPU/Vulkan/VulkanContext.cpp | 65 +++++++++++++++++++----------
 Common/GPU/Vulkan/VulkanContext.h   |  8 +++-
 Common/GPU/Vulkan/VulkanLoader.cpp  |  1 +
 Common/GPU/Vulkan/thin3d_vulkan.cpp |  4 ++
 Common/GPU/thin3d.h                 |  2 +
 UI/DevScreens.cpp                   |  8 ++++
 6 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/Common/GPU/Vulkan/VulkanContext.cpp b/Common/GPU/Vulkan/VulkanContext.cpp
index e57345cfab6d..3f4e6a85a7ec 100644
--- a/Common/GPU/Vulkan/VulkanContext.cpp
+++ b/Common/GPU/Vulkan/VulkanContext.cpp
@@ -1455,6 +1455,50 @@ bool VulkanContext::CreateShaderModule(const std::vector<uint32_t> &spirv, VkSha
 	}
 }
 
+// Debug-only: tries to provoke VK_ERROR_DEVICE_LOST by submitting an invalid buffer copy, to test recovery.
+// This works on NVIDIA to cause a lost device, need to try others.
+void VulkanContext::IntentionallyLoseDevice() {
+	_assert_(device_);
+	VkBufferCreateInfo b{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
+	b.size = 1024;
+	b.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+	b.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+	VmaAllocationCreateInfo allocCreateInfo{};
+	allocCreateInfo.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+	VmaAllocationInfo allocInfo{};
+
+	VkBuffer buffer;
+	VmaAllocation alloc;
+
+	VkResult result = vmaCreateBuffer(Allocator(), &b, &allocCreateInfo, &buffer, &alloc, &allocInfo);
+	_assert_(result == VK_SUCCESS);
+
+	VkCommandPoolCreateInfo ci{ VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO };
+	VkCommandPool cmdPool;
+	vkCreateCommandPool(device_, &ci, nullptr, &cmdPool);
+	VkCommandBufferAllocateInfo cmdAllocInfo{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
+	cmdAllocInfo.commandPool = cmdPool;
+	cmdAllocInfo.commandBufferCount = 1;
+	cmdAllocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+	VkCommandBuffer cmdBuf;
+	vkAllocateCommandBuffers(device_, &cmdAllocInfo, &cmdBuf);
+	VkCommandBufferBeginInfo beginInfo{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+	vkBeginCommandBuffer(cmdBuf, &beginInfo);
+	// Nonsense! { srcOffset, dstOffset, size }: the destination range lies far outside the 1024-byte buffer.
+	VkBufferCopy info{ 0, 1000000000, 100000 };
+	vkCmdCopyBuffer(cmdBuf, buffer, buffer, 1, &info);
+	vkEndCommandBuffer(cmdBuf);
+	VkSubmitInfo submitInfo{ VK_STRUCTURE_TYPE_SUBMIT_INFO };
+	submitInfo.commandBufferCount = 1;
+	submitInfo.pCommandBuffers = &cmdBuf;
+	// NOTE: Depending on which thread this is called from, this can itself be a violation (queueing stuff from a different thread
+	// on a queue used by another thread).
+	VkResult retval = vkQueueSubmit(gfx_queue_, 1, &submitInfo, VK_NULL_HANDLE);
+	// We might not actually lose the device immediately, but good to confirm.
+	NOTICE_LOG(G3D, "Tried to lose the device, vkQueueSubmit retval = %s", VulkanResultToString(retval));
+	// At this point, the device should be lost. Note that buffer, alloc and cmdPool are never released here.
+}
+
 void TransitionImageLayout2(VkCommandBuffer cmd, VkImage image, int baseMip, int numMipLevels, int numLayers, VkImageAspectFlags aspectMask,
 	VkImageLayout oldImageLayout, VkImageLayout newImageLayout,
 	VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
@@ -1712,27 +1756,6 @@ void VulkanDeleteList::PerformDeletes(VulkanContext *vulkan, VmaAllocator alloca
 	queryPools_.clear();
 }
 
-void VulkanContext::GetImageMemoryRequirements(VkImage image, VkMemoryRequirements *mem_reqs, bool *dedicatedAllocation) {
-	if (Extensions().KHR_dedicated_allocation) {
-		VkImageMemoryRequirementsInfo2KHR memReqInfo2{VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR};
-		memReqInfo2.image = image;
-
-		VkMemoryRequirements2KHR memReq2 = {VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR};
-		VkMemoryDedicatedRequirementsKHR memDedicatedReq{VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR};
-		memReq2.pNext = &memDedicatedReq;
-
-		vkGetImageMemoryRequirements2KHR(GetDevice(), &memReqInfo2, &memReq2);
-
-		*mem_reqs = memReq2.memoryRequirements;
-		*dedicatedAllocation =
-			(memDedicatedReq.requiresDedicatedAllocation != VK_FALSE) ||
-			(memDedicatedReq.prefersDedicatedAllocation != VK_FALSE);
-	} else {
-		vkGetImageMemoryRequirements(GetDevice(), image, mem_reqs);
-		*dedicatedAllocation = false;
-	}
-}
-
 bool IsHashMaliDriverVersion(const VkPhysicalDeviceProperties &props) {
 	// ARM used to put a hash in place of the driver version.
 	// Now they only use major versions. We'll just make a bad heuristic.
diff --git a/Common/GPU/Vulkan/VulkanContext.h b/Common/GPU/Vulkan/VulkanContext.h
index de760b934b6a..edc1fca22120 100644
--- a/Common/GPU/Vulkan/VulkanContext.h
+++ b/Common/GPU/Vulkan/VulkanContext.h
@@ -365,8 +365,6 @@ class VulkanContext {
 		return devicePerfClass_;
 	}
 
-	void GetImageMemoryRequirements(VkImage image, VkMemoryRequirements *mem_reqs, bool *dedicatedAllocation);
-
 	VmaAllocator Allocator() const {
 		return allocator_;
 	}
@@ -383,6 +381,10 @@ class VulkanContext {
 		return availablePresentModes_;
 	}
 
+	// Forces a device loss, to help debug device recovery.
+	// It'll create its own command buffer for this.
+	void IntentionallyLoseDevice();
+
 private:
 	bool ChooseQueue();
 
@@ -476,6 +478,8 @@ class VulkanContext {
 	std::vector<VkCommandBuffer> cmdQueue_;
 
 	VmaAllocator allocator_ = VK_NULL_HANDLE;
+
+	bool deviceLost_ = false;
 };
 
 // Detailed control.
diff --git a/Common/GPU/Vulkan/VulkanLoader.cpp b/Common/GPU/Vulkan/VulkanLoader.cpp
index 23f664ba977b..538eeaf2588a 100644
--- a/Common/GPU/Vulkan/VulkanLoader.cpp
+++ b/Common/GPU/Vulkan/VulkanLoader.cpp
@@ -742,6 +742,7 @@ void VulkanFree() {
 const char *VulkanResultToString(VkResult res) {
 	static char temp[128]{};
 	switch (res) {
+	case VK_SUCCESS: return "VK_SUCCESS";
 	case VK_NOT_READY: return "VK_NOT_READY";
 	case VK_TIMEOUT: return "VK_TIMEOUT";
 	case VK_EVENT_SET: return "VK_EVENT_SET";
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index bc7d9ba700cd..a5edda73ba94 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -408,6 +408,10 @@ class VKContext : public DrawContext {
 		}
 	}
 
+	void IntentionallyLoseDevice() override {  // Debug aid: simply forwards the request to vulkan_.
+		vulkan_->IntentionallyLoseDevice();
+	}
+
 	DepthStencilState *CreateDepthStencilState(const DepthStencilStateDesc &desc) override;
 	BlendState *CreateBlendState(const BlendStateDesc &desc) override;
 	InputLayout *CreateInputLayout(const InputLayoutDesc &desc) override;
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index b32fbb3cbaba..c492d1c98b7a 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -834,6 +834,8 @@ class DrawContext {
 	// Not very elegant, but more elegant than the old passId hack.
 	virtual void SetInvalidationCallback(InvalidationCallback callback) = 0;
 
+	virtual void IntentionallyLoseDevice() {}
+
 protected:
 	ShaderModule *vsPresets_[VS_MAX_PRESET];
 	ShaderModule *fsPresets_[FS_MAX_PRESET];
diff --git a/UI/DevScreens.cpp b/UI/DevScreens.cpp
index 36aaf564f7ee..0b178cc62b0c 100644
--- a/UI/DevScreens.cpp
+++ b/UI/DevScreens.cpp
@@ -123,6 +123,14 @@ void DevMenuScreen::CreatePopupContents(UI::ViewGroup *parent) {
 	items->Add(new CheckBox(&g_Config.bDrawFrameGraph, dev->T("Draw Frametimes Graph")));
 	items->Add(new Choice(dev->T("Reset limited logging")))->OnClick.Handle(this, &DevMenuScreen::OnResetLimitedLogging);
 
+	if (g_Config.iGPUBackend == (int)GPUBackend::VULKAN) {
+		items->Add(new Choice(dev->T("Crash GPU")))->OnClick.Add([&](UI::EventParams &) {
+			Draw::DrawContext *draw = screenManager()->getDrawContext();
+			draw->IntentionallyLoseDevice();
+			return UI::EVENT_DONE;
+		});
+	}
+
 	scroll->Add(items);
 	parent->Add(scroll);
 

From f33776bc4a4d1868399f4f24c7b9b40b5acc1d60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 29 May 2023 16:02:53 +0200
Subject: [PATCH 2/4] Fix a very tiny memory leak

---
 Common/GPU/Vulkan/VulkanRenderManager.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Common/GPU/Vulkan/VulkanRenderManager.cpp b/Common/GPU/Vulkan/VulkanRenderManager.cpp
index 287eaf236575..c1bde6d52570 100644
--- a/Common/GPU/Vulkan/VulkanRenderManager.cpp
+++ b/Common/GPU/Vulkan/VulkanRenderManager.cpp
@@ -507,6 +507,8 @@ void VulkanRenderManager::ThreadFunc() {
 		// push more work when it feels like it, and just start working.
 		if (task->runType == VKRRunType::EXIT) {
 			// Oh, host wanted out. Let's leave.
+			delete task;
+			// In this case, there should be no more tasks.
 			break;
 		}
 

From 485131a29cc85df7c7cb90b82fcda18cc04de6ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 29 May 2023 18:17:03 +0200
Subject: [PATCH 3/4] Win32: Plumb through an attempt at handling Vulkan device
 lost.

Unfortunately, recreating the device fails on NV.
---
 Common/GPU/D3D11/thin3d_d3d11.cpp         |  5 +-
 Common/GPU/OpenGL/thin3d_gl.cpp           |  5 +-
 Common/GPU/Vulkan/VulkanContext.cpp       |  3 +-
 Common/GPU/Vulkan/VulkanDebug.cpp         |  8 +++
 Common/GPU/Vulkan/VulkanFrameData.cpp     |  7 ++-
 Common/GPU/Vulkan/VulkanFrameData.h       |  3 ++
 Common/GPU/Vulkan/VulkanQueueRunner.cpp   |  4 ++
 Common/GPU/Vulkan/VulkanRenderManager.cpp | 63 +++++++++++++++++++++--
 Common/GPU/Vulkan/VulkanRenderManager.h   | 10 +++-
 Common/GPU/Vulkan/thin3d_vulkan.cpp       | 11 ++--
 Common/GPU/thin3d.h                       |  2 +-
 Common/GraphicsContext.h                  |  4 ++
 Common/UI/Screen.cpp                      | 24 +++++----
 Common/UI/Screen.h                        |  2 +-
 Common/UI/UIScreen.cpp                    |  9 ++--
 Common/UI/UIScreen.h                      |  2 +-
 Core/Core.cpp                             | 15 +++++-
 UI/EmuScreen.cpp                          |  7 ++-
 UI/EmuScreen.h                            |  2 +-
 UI/NativeApp.cpp                          |  1 +
 Windows/EmuThread.cpp                     | 32 ++++++++++--
 Windows/GPU/WindowsVulkanContext.cpp      |  5 ++
 Windows/GPU/WindowsVulkanContext.h        |  1 +
 23 files changed, 183 insertions(+), 42 deletions(-)

diff --git a/Common/GPU/D3D11/thin3d_d3d11.cpp b/Common/GPU/D3D11/thin3d_d3d11.cpp
index 4a90c318c1f7..7c1dbbbc4bb7 100644
--- a/Common/GPU/D3D11/thin3d_d3d11.cpp
+++ b/Common/GPU/D3D11/thin3d_d3d11.cpp
@@ -137,7 +137,7 @@ class D3D11DrawContext : public DrawContext {
 	void DrawUP(const void *vdata, int vertexCount) override;
 	void Clear(int mask, uint32_t colorval, float depthVal, int stencilVal) override;
 
-	void BeginFrame() override;
+	bool BeginFrame() override;
 
 	std::string GetInfoString(InfoField info) const override {
 		switch (info) {
@@ -1448,7 +1448,7 @@ void D3D11DrawContext::Clear(int mask, uint32_t colorval, float depthVal, int st
 	}
 }
 
-void D3D11DrawContext::BeginFrame() {
+bool D3D11DrawContext::BeginFrame() {
 	context_->OMSetRenderTargets(1, &curRenderTargetView_, curDepthStencilView_);
 
 	if (curBlend_ != nullptr) {
@@ -1475,6 +1475,7 @@ void D3D11DrawContext::BeginFrame() {
 			context_->PSSetConstantBuffers(0, 1, &curPipeline_->dynamicUniforms);
 		}
 	}
+	return true;
 }
 
 void D3D11DrawContext::CopyFramebufferImage(Framebuffer *srcfb, int level, int x, int y, int z, Framebuffer *dstfb, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBit, const char *tag) {
diff --git a/Common/GPU/OpenGL/thin3d_gl.cpp b/Common/GPU/OpenGL/thin3d_gl.cpp
index d4054987f006..4d8c01fa8604 100644
--- a/Common/GPU/OpenGL/thin3d_gl.cpp
+++ b/Common/GPU/OpenGL/thin3d_gl.cpp
@@ -367,7 +367,7 @@ class OpenGLContext : public DrawContext {
 	Buffer *CreateBuffer(size_t size, uint32_t usageFlags) override;
 	Framebuffer *CreateFramebuffer(const FramebufferDesc &desc) override;
 
-	void BeginFrame() override;
+	bool BeginFrame() override;
 	void EndFrame() override;
 
 	void UpdateBuffer(Buffer *buffer, const uint8_t *data, size_t offset, size_t size, UpdateBufferFlags flags) override;
@@ -782,10 +782,11 @@ OpenGLContext::~OpenGLContext() {
 	}
 }
 
-void OpenGLContext::BeginFrame() {
+bool OpenGLContext::BeginFrame() {
 	renderManager_.BeginFrame(debugFlags_ & DebugFlags::PROFILE_TIMESTAMPS);
 	FrameData &frameData = frameData_[renderManager_.GetCurFrame()];
 	renderManager_.BeginPushBuffer(frameData.push);
+	return true;
 }
 
 void OpenGLContext::EndFrame() {
diff --git a/Common/GPU/Vulkan/VulkanContext.cpp b/Common/GPU/Vulkan/VulkanContext.cpp
index 3f4e6a85a7ec..47b6152a4215 100644
--- a/Common/GPU/Vulkan/VulkanContext.cpp
+++ b/Common/GPU/Vulkan/VulkanContext.cpp
@@ -702,7 +702,8 @@ VkResult VulkanContext::CreateDevice() {
 	VkResult res = vkCreateDevice(physical_devices_[physical_device_], &device_info, nullptr, &device_);
 	if (res != VK_SUCCESS) {
 		init_error_ = "Unable to create Vulkan device";
-		ERROR_LOG(G3D, "Unable to create Vulkan device");
+		ERROR_LOG(G3D, "Unable to create Vulkan device: '%s'", VulkanResultToString(res));
+		return res;
 	} else {
 		VulkanLoadDeviceFunctions(device_, extensionsLookup_);
 	}
diff --git a/Common/GPU/Vulkan/VulkanDebug.cpp b/Common/GPU/Vulkan/VulkanDebug.cpp
index 00a226897413..e3972d0288c9 100644
--- a/Common/GPU/Vulkan/VulkanDebug.cpp
+++ b/Common/GPU/Vulkan/VulkanDebug.cpp
@@ -76,6 +76,14 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
 		// Extended validation (ARM best practices)
 		// Non-fifo validation not recommended
 		return false;
+
+	// These get triggered during the device lost simulation. Ignore.
+	case -556648736:
+	case 1812873262:
+	case 337425955:
+		WARN_LOG(G3D, "Validation message %d typical of device lost simulation, ignoring.", messageCode);
+		return false;
+
 	default:
 		break;
 	}
diff --git a/Common/GPU/Vulkan/VulkanFrameData.cpp b/Common/GPU/Vulkan/VulkanFrameData.cpp
index 90d2c434839c..1a1e221ccc37 100644
--- a/Common/GPU/Vulkan/VulkanFrameData.cpp
+++ b/Common/GPU/Vulkan/VulkanFrameData.cpp
@@ -93,6 +93,7 @@ void FrameData::AcquireNextImage(VulkanContext *vulkan, FrameDataShared &shared)
 
 VkResult FrameData::QueuePresent(VulkanContext *vulkan, FrameDataShared &shared) {
 	_dbg_assert_(hasAcquired);
+	_dbg_assert_(!deviceLost);
 	hasAcquired = false;
 	_dbg_assert_(!skipSwap);
 
@@ -132,6 +133,8 @@ VkCommandBuffer FrameData::GetInitCmd(VulkanContext *vulkan) {
 }
 
 void FrameData::SubmitPending(VulkanContext *vulkan, FrameSubmitType type, FrameDataShared &sharedData) {
+	_dbg_assert_(!deviceLost);
+
 	VkCommandBuffer cmdBufs[3];
 	int numCmdBufs = 0;
 
@@ -206,7 +209,9 @@ void FrameData::SubmitPending(VulkanContext *vulkan, FrameSubmitType type, Frame
 	}
 
 	if (res == VK_ERROR_DEVICE_LOST) {
-		_assert_msg_(false, "Lost the Vulkan device in vkQueueSubmit! If this happens again, switch Graphics Backend away from Vulkan");
+		ERROR_LOG(G3D, "Lost the Vulkan device in vkQueueSubmit! If this happens again, switch Graphics Backend away from Vulkan");
+		deviceLost = true;
+		return;
 	} else {
 		_assert_msg_(res == VK_SUCCESS, "vkQueueSubmit failed (main)! result=%s", VulkanResultToString(res));
 	}
diff --git a/Common/GPU/Vulkan/VulkanFrameData.h b/Common/GPU/Vulkan/VulkanFrameData.h
index 0e1344f24e85..93fe7ad4f214 100644
--- a/Common/GPU/Vulkan/VulkanFrameData.h
+++ b/Common/GPU/Vulkan/VulkanFrameData.h
@@ -88,6 +88,9 @@ struct FrameData {
 
 	bool syncDone = false;
 
+	// Set if the device was just lost.
+	bool deviceLost = false;
+
 	// Swapchain.
 	uint32_t curSwapchainImage = -1;
 
diff --git a/Common/GPU/Vulkan/VulkanQueueRunner.cpp b/Common/GPU/Vulkan/VulkanQueueRunner.cpp
index e7670e99445f..5abd03693890 100644
--- a/Common/GPU/Vulkan/VulkanQueueRunner.cpp
+++ b/Common/GPU/Vulkan/VulkanQueueRunner.cpp
@@ -395,6 +395,9 @@ void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, FrameData &frame
 					vkCmdEndDebugUtilsLabelEXT(cmd);
 				}
 				frameData.SubmitPending(vulkan_, FrameSubmitType::Pending, frameDataShared);
+				if (frameData.deviceLost) {
+					goto bail;
+				}
 
 				// When stepping in the GE debugger, we can end up here multiple times in a "frame".
 				// So only acquire once.
@@ -447,6 +450,7 @@ void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, FrameData &frame
 		}
 	}
 
+bail:
 	// Deleting all in one go should be easier on the instruction cache than deleting
 	// them as we go - and easier to debug because we can look backwards in the frame.
 	if (!keepSteps) {
diff --git a/Common/GPU/Vulkan/VulkanRenderManager.cpp b/Common/GPU/Vulkan/VulkanRenderManager.cpp
index c1bde6d52570..af5e625894df 100644
--- a/Common/GPU/Vulkan/VulkanRenderManager.cpp
+++ b/Common/GPU/Vulkan/VulkanRenderManager.cpp
@@ -492,6 +492,9 @@ void VulkanRenderManager::DrainCompileQueue() {
 void VulkanRenderManager::ThreadFunc() {
 	SetCurrentThreadName("RenderMan");
 	while (true) {
+		if (deviceLost_) {
+			break;
+		}
 		// Pop a task of the queue and execute it.
 		VKRRenderThreadTask *task = nullptr;
 		{
@@ -503,6 +506,12 @@ void VulkanRenderManager::ThreadFunc() {
 			renderThreadQueue_.pop();
 		}
 
+		if (deviceLost_) {
+			delete task;
+			// We'll clear out the rest after the break.
+			break;
+		}
+
 		// Oh, we got a task! We can now have pushMutex_ unlocked, allowing the host to
 		// push more work when it feels like it, and just start working.
 		if (task->runType == VKRRunType::EXIT) {
@@ -516,6 +525,16 @@ void VulkanRenderManager::ThreadFunc() {
 		delete task;
 	}
 
+	{
+		// Make sure nothing is left.
+		std::unique_lock<std::mutex> lock(pushMutex_);
+		while (!renderThreadQueue_.empty()) {
+			VKRRenderThreadTask *task = renderThreadQueue_.front();
+			renderThreadQueue_.pop();
+			delete task;
+		}
+	}
+
 	// Wait for the device to be done with everything, before tearing stuff down.
 	// TODO: Do we need this?
 	vkDeviceWaitIdle(vulkan_->GetDevice());
@@ -523,8 +542,11 @@ void VulkanRenderManager::ThreadFunc() {
 	VLOG("PULL: Quitting");
 }
 
-void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfiler) {
+bool VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfiler) {
 	VLOG("BeginFrame");
+	if (deviceLost_) {
+		return false;
+	}
 	VkDevice device = vulkan_->GetDevice();
 
 	int curFrame = vulkan_->GetCurFrame();
@@ -545,8 +567,15 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
 	// This must be the very first Vulkan call we do in a new frame.
 	// Makes sure the very last command buffer from the frame before the previous has been fully executed.
 	if (vkWaitForFences(device, 1, &frameData.fence, true, UINT64_MAX) == VK_ERROR_DEVICE_LOST) {
-		_assert_msg_(false, "Device lost in vkWaitForFences");
+		ERROR_LOG(G3D, "Device lost in vkWaitForFences");
+		frameData.deviceLost = true;
+		deviceLost_ = true;
+		// If the render thread is waiting for an event that won't come, kick it loose.
+		pushCondVar_.notify_one();
+		frameData.readyForFence = true;
+		return false;
 	}
+
 	vkResetFences(device, 1, &frameData.fence);
 
 	int validBits = vulkan_->GetQueueFamilyProperties(vulkan_->GetGraphicsQueueFamilyIndex()).timestampValidBits;
@@ -616,6 +645,7 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
 		frameData.profile.timestampDescriptions.push_back("initCmd");
 		VkCommandBuffer initCmd = GetInitCmd();
 	}
+	return true;
 }
 
 VkCommandBuffer VulkanRenderManager::GetInitCmd() {
@@ -1280,6 +1310,10 @@ void VulkanRenderManager::Finish() {
 	steps_.clear();
 	vulkan_->EndFrame();
 	insideFrame_ = false;
+
+	if (deviceLost_) {
+		WARN_LOG(G3D, "VulkanRenderManager::Finish: Device lost");
+	}
 }
 
 void VulkanRenderManager::Wipe() {
@@ -1292,11 +1326,16 @@ void VulkanRenderManager::Wipe() {
 // Called on the render thread.
 //
 // Can be called again after a VKRRunType::SYNC on the same frame.
-void VulkanRenderManager::Run(VKRRenderThreadTask &task) {
+bool VulkanRenderManager::Run(VKRRenderThreadTask &task) {
+	_dbg_assert_(!deviceLost_);
+
 	FrameData &frameData = frameData_[task.frame];
 
 	_dbg_assert_(!frameData.hasPresentCommands);
 	frameData.SubmitPending(vulkan_, FrameSubmitType::Pending, frameDataShared_);
+	if (frameData.deviceLost) {
+		return false;
+	}
 
 	if (!frameData.hasMainCommands) {
 		// Effectively resets both main and present command buffers, since they both live in this pool.
@@ -1312,8 +1351,9 @@ void VulkanRenderManager::Run(VKRRenderThreadTask &task) {
 
 	queueRunner_.PreprocessSteps(task.steps);
 	// Likely during shutdown, happens in headless.
-	if (task.steps.empty() && !frameData.hasAcquired)
+	if (task.steps.empty() && !frameData.hasAcquired) {
 		frameData.skipSwap = true;
+	}
 	//queueRunner_.LogSteps(stepsOnThread, false);
 	if (IsVREnabled()) {
 		int passes = GetVRPassesCount();
@@ -1326,10 +1366,18 @@ void VulkanRenderManager::Run(VKRRenderThreadTask &task) {
 		queueRunner_.RunSteps(task.steps, frameData, frameDataShared_);
 	}
 
+	if (frameData.deviceLost) {
+		deviceLost_ = true;
+		return false;
+	}
+
 	switch (task.runType) {
 	case VKRRunType::PRESENT:
 		frameData.SubmitPending(vulkan_, FrameSubmitType::Present, frameDataShared_);
-
+		if (frameData.deviceLost) {
+			deviceLost_ = true;
+			return false;
+		}
 		if (!frameData.skipSwap) {
 			VkResult res = frameData.QueuePresent(vulkan_, frameDataShared_);
 			if (res == VK_ERROR_OUT_OF_DATE_KHR) {
@@ -1369,7 +1417,12 @@ void VulkanRenderManager::Run(VKRRenderThreadTask &task) {
 		_dbg_assert_(false);
 	}
 
+	if (frameData.deviceLost) {
+		deviceLost_ = true;
+	}
 	VLOG("PULL: Finished running frame %d", task.frame);
+
+	return !deviceLost_;
 }
 
 // Called from main thread.
diff --git a/Common/GPU/Vulkan/VulkanRenderManager.h b/Common/GPU/Vulkan/VulkanRenderManager.h
index 58ef8b5d0b84..232a6c9fe9f2 100644
--- a/Common/GPU/Vulkan/VulkanRenderManager.h
+++ b/Common/GPU/Vulkan/VulkanRenderManager.h
@@ -185,7 +185,8 @@ class VulkanRenderManager {
 	~VulkanRenderManager();
 
 	// Makes sure that the GPU has caught up enough that we can start writing buffers of this frame again.
-	void BeginFrame(bool enableProfiling, bool enableLogProfiler);
+	// A false return value means that the device is lost, and we should just try to end the frame ASAP without doing anything.
+	bool BeginFrame(bool enableProfiling, bool enableLogProfiler);
 	// Can run on a different thread!
 	void Finish();
 	// Zaps queued up commands. Use if you know there's a risk you've queued up stuff that has already been deleted. Can happen during in-game shutdown.
@@ -461,13 +462,16 @@ class VulkanRenderManager {
 	void ResetStats();
 	void DrainCompileQueue();
 
+	bool DeviceIsLost() const { return deviceLost_; }
+
 private:
 	void EndCurRenderStep();
 
 	void ThreadFunc();
 	void CompileThreadFunc();
 
-	void Run(VKRRenderThreadTask &task);
+	// Fails if the device was lost.
+	bool Run(VKRRenderThreadTask &task);
 
 	// Bad for performance but sometimes necessary for synchronous CPU readbacks (screenshots and whatnot).
 	void FlushSync();
@@ -481,6 +485,8 @@ class VulkanRenderManager {
 
 	int outOfDateFrames_ = 0;
 
+	bool deviceLost_ = false;
+
 	// Submission time state
 
 	// Note: These are raw backbuffer-sized. Rotated.
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index a5edda73ba94..42d0d4a24797 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -476,7 +476,7 @@ class VKContext : public DrawContext {
 
 	void Clear(int mask, uint32_t colorval, float depthVal, int stencilVal) override;
 
-	void BeginFrame() override;
+	bool BeginFrame() override;
 	void EndFrame() override;
 	void WipeQueue() override;
 
@@ -1072,15 +1072,18 @@ VKContext::~VKContext() {
 	vulkan_->Delete().QueueDeletePipelineCache(pipelineCache_);
 }
 
-void VKContext::BeginFrame() {
-	// TODO: Bad dependency on g_Config here!
-	renderManager_.BeginFrame(debugFlags_ & DebugFlags::PROFILE_TIMESTAMPS, debugFlags_ & DebugFlags::PROFILE_SCOPES);
+bool VKContext::BeginFrame() {
+	if (!renderManager_.BeginFrame(debugFlags_ & DebugFlags::PROFILE_TIMESTAMPS, debugFlags_ & DebugFlags::PROFILE_SCOPES)) {
+		// Something failed badly, let's bail.
+		return false;
+	}
 
 	FrameData &frame = frame_[vulkan_->GetCurFrame()];
 
 	push_->BeginFrame();
 
 	frame.descriptorPool.Reset();
+	return true;
 }
 
 void VKContext::EndFrame() {
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index c492d1c98b7a..29864010629c 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -805,7 +805,7 @@ class DrawContext {
 	virtual void DrawUP(const void *vdata, int vertexCount) = 0;
 	
 	// Frame management (for the purposes of sync and resource management, necessary with modern APIs). Default implementations here.
-	virtual void BeginFrame() {}
+	virtual bool BeginFrame() { return true; }
 	virtual void EndFrame() = 0;
 	virtual void WipeQueue() {}
 
diff --git a/Common/GraphicsContext.h b/Common/GraphicsContext.h
index 873722764c24..945067b5f091 100644
--- a/Common/GraphicsContext.h
+++ b/Common/GraphicsContext.h
@@ -38,5 +38,9 @@ class GraphicsContext {
 	// Should strive to get rid of these.
 	virtual void Poll() {}
 
+	virtual bool DeviceIsLost() const {  // Default implementation never reports a lost device; backends may override.
+		return false;
+	}
+
 	virtual Draw::DrawContext *GetDrawContext() = 0;
 };
diff --git a/Common/UI/Screen.cpp b/Common/UI/Screen.cpp
index 0eeaa66214e9..05602533f642 100644
--- a/Common/UI/Screen.cpp
+++ b/Common/UI/Screen.cpp
@@ -165,21 +165,23 @@ void ScreenManager::render() {
 
 				// TODO: Make really sure that this "mismatched" pre/post only happens
 				// when screens are "compatible" (both are UIScreens, for example).
-				backback.screen->preRender();
-				backback.screen->render();
+				if (backback.screen->preRender()) {
+					backback.screen->render();
+					stack_.back().screen->render();
+					if (postRenderCb_)
+						postRenderCb_(getUIContext(), postRenderUserdata_);
+					backback.screen->postRender();
+					break;
+				}
+			}
+		default:
+			_assert_(stack_.back().screen);
+			if (stack_.back().screen->preRender()) {
 				stack_.back().screen->render();
 				if (postRenderCb_)
 					postRenderCb_(getUIContext(), postRenderUserdata_);
-				backback.screen->postRender();
-				break;
+				stack_.back().screen->postRender();
 			}
-		default:
-			_assert_(stack_.back().screen);
-			stack_.back().screen->preRender();
-			stack_.back().screen->render();
-			if (postRenderCb_)
-				postRenderCb_(getUIContext(), postRenderUserdata_);
-			stack_.back().screen->postRender();
 			break;
 		}
 	} else {
diff --git a/Common/UI/Screen.h b/Common/UI/Screen.h
index fca213fdb814..9cd730f30041 100644
--- a/Common/UI/Screen.h
+++ b/Common/UI/Screen.h
@@ -50,7 +50,7 @@ class Screen {
 
 	virtual void onFinish(DialogResult reason) {}
 	virtual void update() {}
-	virtual void preRender() {}
+	virtual bool preRender() { return true; }  // If this returns false, something is really bad and we should try to skip the rest of the frame, the error will be handled at the end.
 	virtual void render() {}
 	virtual void postRender() {}
 	virtual void resized() {}
diff --git a/Common/UI/UIScreen.cpp b/Common/UI/UIScreen.cpp
index 3da8425421a9..719a9b593097 100644
--- a/Common/UI/UIScreen.cpp
+++ b/Common/UI/UIScreen.cpp
@@ -189,13 +189,15 @@ void UIScreen::deviceRestored() {
 		root_->DeviceRestored(screenManager()->getDrawContext());
 }
 
-void UIScreen::preRender() {
+bool UIScreen::preRender() {
 	using namespace Draw;
 	Draw::DrawContext *draw = screenManager()->getDrawContext();
 	if (!draw) {
-		return;
+		return true;
+	}
+	if (!draw->BeginFrame()) {
+		return false;
 	}
-	draw->BeginFrame();
 	// Bind and clear the back buffer
 	draw->BindFramebufferAsRenderTarget(nullptr, { RPAction::CLEAR, RPAction::CLEAR, RPAction::CLEAR, 0xFF000000 }, "UI");
 	screenManager()->getUIContext()->BeginFrame();
@@ -209,6 +211,7 @@ void UIScreen::preRender() {
 	viewport.MinDepth = 0.0;
 	draw->SetViewport(viewport);
 	draw->SetTargetSize(g_display.pixel_xres, g_display.pixel_yres);
+	return true;
 }
 
 void UIScreen::postRender() {
diff --git a/Common/UI/UIScreen.h b/Common/UI/UIScreen.h
index 60eb1749b044..a1f1e86ae407 100644
--- a/Common/UI/UIScreen.h
+++ b/Common/UI/UIScreen.h
@@ -36,7 +36,7 @@ class UIScreen : public Screen {
 	~UIScreen();
 
 	void update() override;
-	void preRender() override;
+	bool preRender() override;
 	void render() override;
 	void postRender() override;
 	void deviceLost() override;
diff --git a/Core/Core.cpp b/Core/Core.cpp
index daf858a30bfa..b77148441d2c 100644
--- a/Core/Core.cpp
+++ b/Core/Core.cpp
@@ -230,6 +230,10 @@ void Core_RunLoop(GraphicsContext *ctx) {
 		Core_StateProcessed();
 		double startTime = time_now_d();
 		UpdateRunLoop();
+		if (ctx->DeviceIsLost()) {
+			// Query the context we were handed (the caller re-checks the same one) and let the outer loop take care of this.
+			return;
+		}
 
 		// Simple throttling to not burn the GPU in the menu.
 		double diffTime = time_now_d() - startTime;
@@ -243,6 +247,10 @@ void Core_RunLoop(GraphicsContext *ctx) {
 
 	while ((coreState == CORE_RUNNING || coreState == CORE_STEPPING) && GetUIState() == UISTATE_INGAME) {
 		UpdateRunLoop();
+		if (ctx->DeviceIsLost()) {
+			// Query the context we were handed (it is also used for SwapBuffers below) and let the outer loop take care of this.
+			break;
+		}
 		if (!windowHidden && !Core_IsStepping()) {
 			ctx->SwapBuffers();
 
@@ -337,6 +345,9 @@ bool Core_Run(GraphicsContext *ctx) {
 				return false;
 			}
 			Core_RunLoop(ctx);
+			if (ctx->DeviceIsLost()) {
+				return true;
+			}
 			continue;
 		}
 
@@ -345,6 +356,9 @@ bool Core_Run(GraphicsContext *ctx) {
 		case CORE_STEPPING:
 			// enter a fast runloop
 			Core_RunLoop(ctx);
+			if (ctx->DeviceIsLost()) {
+				return true;
+			}
 			if (coreState == CORE_POWERDOWN) {
 				Core_StateProcessed();
 				return true;
@@ -357,7 +371,6 @@ bool Core_Run(GraphicsContext *ctx) {
 		case CORE_RUNTIME_ERROR:
 			// Exit loop!!
 			Core_StateProcessed();
-
 			return true;
 
 		case CORE_NEXTFRAME:
diff --git a/UI/EmuScreen.cpp b/UI/EmuScreen.cpp
index 0d926f9531da..02f3facea09b 100644
--- a/UI/EmuScreen.cpp
+++ b/UI/EmuScreen.cpp
@@ -1414,10 +1414,12 @@ static void DrawFrameTimes(UIContext *ctx, const Bounds &bounds) {
 	ctx->RebindTexture();
 }
 
-void EmuScreen::preRender() {
+bool EmuScreen::preRender() {
 	using namespace Draw;
 	DrawContext *draw = screenManager()->getDrawContext();
-	draw->BeginFrame();
+	if (!draw->BeginFrame()) {
+		return false;
+	}
 	// Here we do NOT bind the backbuffer or clear the screen, unless non-buffered.
 	// The emuscreen is different than the others - we really want to allow the game to render to framebuffers
 	// before we ever bind the backbuffer for rendering. On mobile GPUs, switching back and forth between render
@@ -1443,6 +1445,7 @@ void EmuScreen::preRender() {
 		draw->SetViewport(viewport);
 	}
 	draw->SetTargetSize(g_display.pixel_xres, g_display.pixel_yres);
+	return true;
 }
 
 void EmuScreen::postRender() {
diff --git a/UI/EmuScreen.h b/UI/EmuScreen.h
index 7eda3988d413..af0a4fb411b9 100644
--- a/UI/EmuScreen.h
+++ b/UI/EmuScreen.h
@@ -44,7 +44,7 @@ class EmuScreen : public UIScreen {
 
 	void update() override;
 	void render() override;
-	void preRender() override;
+	bool preRender() override;
 	void postRender() override;
 	void dialogFinished(const Screen *dialog, DialogResult result) override;
 	void sendMessage(const char *msg, const char *value) override;
diff --git a/UI/NativeApp.cpp b/UI/NativeApp.cpp
index f5f6b0e508f0..c90c4c4eeffb 100644
--- a/UI/NativeApp.cpp
+++ b/UI/NativeApp.cpp
@@ -1045,6 +1045,7 @@ void RenderOverlays(UIContext *dc, void *userdata) {
 void NativeRender(GraphicsContext *graphicsContext) {
 	_dbg_assert_(graphicsContext != nullptr);
 	_dbg_assert_(g_screenManager != nullptr);
+	_dbg_assert_(!graphicsContext->DeviceIsLost());  // Rendering must never be attempted on a lost device.
 
 	g_GameManager.Update();
 
diff --git a/Windows/EmuThread.cpp b/Windows/EmuThread.cpp
index 71af97e683f7..52f4c227f841 100644
--- a/Windows/EmuThread.cpp
+++ b/Windows/EmuThread.cpp
@@ -259,6 +259,7 @@ void MainThreadFunc() {
 
 		// No safe way out without graphics.
 		ExitProcess(1);
+		return;  // This return never executes, but helps the compiler.
 	}
 
 	GraphicsContext *graphicsContext = g_graphicsContext;
@@ -301,6 +302,28 @@ void MainThreadFunc() {
 			if (!Core_IsActive())
 				UpdateUIState(UISTATE_MENU);
 			Core_Run(g_graphicsContext);
+			if (g_graphicsContext->DeviceIsLost()) {
+				// Try to recreate the device here.
+				NativeShutdownGraphics();
+				graphicsContext->StopThread();
+				graphicsContext->ShutdownFromRenderThread();
+				delete graphicsContext;
+				graphicsContext = g_graphicsContext = nullptr;  // The global aliased the deleted context; don't leave it dangling if recreation fails.
+
+				bool success = CreateGraphicsBackend(&error_string, &g_graphicsContext);
+				if (success) {
+					graphicsContext = g_graphicsContext;
+					// Main thread is the render thread.
+					success = g_graphicsContext->InitFromRenderThread(&error_string);
+				}
+				if (!success) {
+					ERROR_LOG(G3D, "Failed to recreate Vulkan device after device loss");
+					coreState = CORE_POWERDOWN;
+					break;
+				}
+				NativeInitGraphics(graphicsContext);
+			}
+
 			if (coreState == CORE_BOOT_ERROR) {
 				break;
 			}
@@ -329,11 +352,12 @@ void MainThreadFunc() {
 	if (!useEmuThread) {
 		NativeShutdownGraphics();
 	}
+	if (g_graphicsContext) {
+		g_graphicsContext->ThreadEnd();
+		g_graphicsContext->ShutdownFromRenderThread();
 
-	g_graphicsContext->ThreadEnd();
-	g_graphicsContext->ShutdownFromRenderThread();
-
-	g_graphicsContext->Shutdown();
+		g_graphicsContext->Shutdown();
+	}
 
 	UpdateConsolePosition();
 	NativeShutdown();
diff --git a/Windows/GPU/WindowsVulkanContext.cpp b/Windows/GPU/WindowsVulkanContext.cpp
index 818f398420c8..d51762c49516 100644
--- a/Windows/GPU/WindowsVulkanContext.cpp
+++ b/Windows/GPU/WindowsVulkanContext.cpp
@@ -119,6 +119,7 @@ bool WindowsVulkanContext::Init(HINSTANCE hInst, HWND hWnd, std::string *error_m
 	vulkan_->ChooseDevice(deviceNum);
 	if (vulkan_->CreateDevice() != VK_SUCCESS) {
 		*error_message = vulkan_->InitError();
+		vulkan_->DestroyInstance();
 		delete vulkan_;
 		vulkan_ = nullptr;
 		return false;
@@ -184,3 +185,7 @@ void WindowsVulkanContext::Poll() {
 void *WindowsVulkanContext::GetAPIContext() {
 	return vulkan_;
 }
+
+bool WindowsVulkanContext::DeviceIsLost() const {  // Reports whatever renderManager_ says; NOTE(review): assumes renderManager_ is non-null here - confirm.
+	return renderManager_->DeviceIsLost();
+}
diff --git a/Windows/GPU/WindowsVulkanContext.h b/Windows/GPU/WindowsVulkanContext.h
index 49e6613e3eb5..dfc4f5625970 100644
--- a/Windows/GPU/WindowsVulkanContext.h
+++ b/Windows/GPU/WindowsVulkanContext.h
@@ -35,6 +35,7 @@ class WindowsVulkanContext : public WindowsGraphicsContext {
 	void Poll() override;
 
 	void *GetAPIContext() override;
+	bool DeviceIsLost() const override;
 
 	Draw::DrawContext *GetDrawContext() override { return draw_; }
 private:

From 6df78568c854afdd557e777d4c9c0375c91d57de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 29 May 2023 18:32:37 +0200
Subject: [PATCH 4/4] Clear the shader cache

---
 GPU/Vulkan/ShaderManagerVulkan.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPU/Vulkan/ShaderManagerVulkan.cpp b/GPU/Vulkan/ShaderManagerVulkan.cpp
index fcae89a8ea41..9a781c2217a4 100644
--- a/GPU/Vulkan/ShaderManagerVulkan.cpp
+++ b/GPU/Vulkan/ShaderManagerVulkan.cpp
@@ -237,6 +237,7 @@ ShaderManagerVulkan::~ShaderManagerVulkan() {
 }
 
 void ShaderManagerVulkan::DeviceLost() {
+	Clear();  // We only really need to do this if the device is actually lost, so DeviceLost might need an argument.
 	draw_ = nullptr;
 }