Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Depth raster: Try rasterization on a background thread #19792

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Common/GPU/Vulkan/VulkanRenderManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ using namespace PPSSPP_VK;

// renderPass is an example of the "compatibility class" or RenderPassType type.
bool VKRGraphicsPipeline::Create(VulkanContext *vulkan, VkRenderPass compatibleRenderPass, RenderPassType rpType, VkSampleCountFlagBits sampleCount, double scheduleTime, int countToCompile) {
_dbg_assert_(desc);
// Good torture test to test the shutdown-while-precompiling-shaders issue on PC where it's normally
// hard to catch because shaders compile so fast.
// sleep_ms(200);
Expand Down
10 changes: 9 additions & 1 deletion Core/Config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,15 @@ const char *DefaultLangRegion() {
}

static int DefaultDepthRaster() {
#if PPSSPP_PLATFORM(ANDROID) || PPSSPP_PLATFORM(IOS)
// All single cores default to off.
if (cpu_info.num_cores == 1) {
return (int)DepthRasterMode::OFF;
}

// ARMv7 also defaults to off.
#if PPSSPP_PLATFORM(ARM)
return (int)DepthRasterMode::OFF;
#elif PPSSPP_PLATFORM(ANDROID) || PPSSPP_PLATFORM(IOS)
return (int)DepthRasterMode::LOW_QUALITY;
#else
return (int)DepthRasterMode::DEFAULT;
Expand Down
221 changes: 153 additions & 68 deletions GPU/Common/DrawEngineCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "Common/LogReporting.h"
#include "Common/Math/SIMDHeaders.h"
#include "Common/Math/CrossSIMD.h"
#include "Common/Thread/ThreadUtil.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Common/TimeUtil.h"
#include "Core/System.h"
Expand Down Expand Up @@ -54,7 +55,7 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
transformedExpanded_ = (TransformedVertex *)AllocateMemoryPages(3 * TRANSFORMED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
decoded_ = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
indexGen.Setup(decIndex_);
indexGen_.Setup(decIndex_);

switch ((DepthRasterMode)g_Config.iDepthRasterMode) {
case DepthRasterMode::DEFAULT:
Expand All @@ -69,6 +70,7 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
}
if (useDepthRaster_) {
depthDraws_.reserve(256);
depthThread_ = std::thread([this]() { DepthThreadFunc(); });
}
}

Expand All @@ -87,6 +89,11 @@ DrawEngineCommon::~DrawEngineCommon() {
delete decoder;
});
ClearSplineBezierWeights();
if (depthThread_.joinable()) {
exitDepthThread_ = true;
depthEnqueueCond_.notify_one();
depthThread_.join();
}
}

void DrawEngineCommon::Init() {
Expand Down Expand Up @@ -677,7 +684,7 @@ int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *
}

void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, VertexDecoder *dec, u32 vertTypeID, int *bytesRead) {
if (!indexGen.PrimCompatible(prevPrim_, prim)) {
if (!indexGen_.PrimCompatible(prevPrim_, prim)) {
Flush();
}

Expand All @@ -697,7 +704,7 @@ void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, VertexDec

// vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
bool DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, VertexDecoder *dec, u32 vertTypeID, bool clockwise, int *bytesRead) {
if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
if (!indexGen_.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
Flush();
}
_dbg_assert_(numDrawVerts_ < MAX_DEFERRED_DRAW_VERTS);
Expand Down Expand Up @@ -838,22 +845,22 @@ int DrawEngineCommon::DecodeInds() {
// 2. Loop through the drawcalls, translating indices as we go.
switch (di.indexType) {
case GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT:
indexGen.AddPrim(di.prim, di.vertexCount, indexOffset, clockwise);
indexGen_.AddPrim(di.prim, di.vertexCount, indexOffset, clockwise);
break;
case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT:
indexGen.TranslatePrim(di.prim, di.vertexCount, (const u8 *)di.inds, indexOffset, clockwise);
indexGen_.TranslatePrim(di.prim, di.vertexCount, (const u8 *)di.inds, indexOffset, clockwise);
break;
case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT:
indexGen.TranslatePrim(di.prim, di.vertexCount, (const u16_le *)di.inds, indexOffset, clockwise);
indexGen_.TranslatePrim(di.prim, di.vertexCount, (const u16_le *)di.inds, indexOffset, clockwise);
break;
case GE_VTYPE_IDX_32BIT >> GE_VTYPE_IDX_SHIFT:
indexGen.TranslatePrim(di.prim, di.vertexCount, (const u32_le *)di.inds, indexOffset, clockwise);
indexGen_.TranslatePrim(di.prim, di.vertexCount, (const u32_le *)di.inds, indexOffset, clockwise);
break;
}
}
decodeIndsCounter_ = i;

return indexGen.VertexCount();
return indexGen_.VertexCount();
}

bool DrawEngineCommon::CanUseHardwareTransform(int prim) const {
Expand Down Expand Up @@ -1001,6 +1008,142 @@ bool DrawEngineCommon::CalculateDepthDraw(DepthDraw *draw, GEPrimitiveType prim,
return true;
}

// TODO: Possibly split this in stages, to avoid switching back and forth between clipping and drawing.
void DrawEngineCommon::ProcessDepthDraw(const DepthDraw &draw) {
int *tx = depthScreenVerts_;
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
float *tz = (float *)(depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2);

int outVertCount = 0;

const float *vertices = depthTransformed_ + 4 * draw.vertexOffset;
const uint16_t *indices = depthIndices_ + draw.indexOffset;

DepthScissor tileScissor = draw.scissor.Tile(0, 1);

const bool collectStats = coreCollectDebugStats;
const bool lowQ = g_Config.iDepthRasterMode == (int)DepthRasterMode::LOW_QUALITY;

{
TimeCollector collectStat(&gpuStats.msCullDepth, collectStats);
switch (draw.prim) {
case GE_PRIM_RECTANGLES:
outVertCount = DepthRasterClipIndexedRectangles(tx, ty, tz, vertices, indices, draw, tileScissor);
break;
case GE_PRIM_TRIANGLES:
outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, vertices, indices, draw, tileScissor);
break;
default:
_dbg_assert_(false);
break;
}
}
{
TimeCollector collectStat(&gpuStats.msRasterizeDepth, collectStats);
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(draw.depthAddr), draw.depthStride, tx, ty, tz, outVertCount, draw, tileScissor, lowQ);
}
}

// Appends a draw to the depth queue and wakes the depth thread. The first draw
// queued into an empty list also opens a new pass: the pass start time is
// recorded and inDepthDrawPass_ is raised.
void DrawEngineCommon::EnqueueDepthDraw(const DepthDraw &draw) {
	std::lock_guard<std::mutex> guard(depthEnqueueMutex_);

	const bool opensNewPass = depthDraws_.empty();
	if (opensNewPass) {
		// An empty queue means the previous pass must have been fully reset.
		_dbg_assert_(curDraw_ == 0);
		_dbg_assert_(!inDepthDrawPass_);
	}

	depthDraws_.push_back(draw);

	if (opensNewPass) {
		depthRasterPassStart_ = time_now_d();
		inDepthDrawPass_ = true;
	}

	// Always notify: even mid-pass, the thread may have caught up and gone to sleep.
	depthEnqueueCond_.notify_one();
}

// Returns true if we actually waited.
void DrawEngineCommon::WaitForDepthPassFinish() {
{
std::lock_guard<std::mutex> lock(depthEnqueueMutex_);
// In case the depth raster thread is idle, we need to nudge it.
_dbg_assert_(!finishedSubmitting_);
finishedSubmitting_ = true;
depthEnqueueCond_.notify_one();
}

// OK, we're in a pass. Wait for the thread to finish work.
std::unique_lock<std::mutex> lock(depthFinishMutex_);
while (!finishedDrawing_) {
depthFinishCond_.wait(lock);
}
}

void DrawEngineCommon::FlushQueuedDepth() {
if (depthRasterPassStart_ != 0.0) {
gpuStats.msRasterTimeAvailable += time_now_d() - depthRasterPassStart_;
depthRasterPassStart_ = 0.0;
}

if (inDepthDrawPass_) {
WaitForDepthPassFinish();
// At this point, we know that the depth thread is paused.

// Reset queue
depthIndexCount_ = 0;
depthVertexCount_ = 0;
depthDraws_.clear();
inDepthDrawPass_ = false;
finishedSubmitting_ = false;
finishedDrawing_ = false; // not sure if it matters who resets this.
curDraw_ = 0;
} else {
_dbg_assert_(curDraw_ == 0);
_dbg_assert_(!inDepthDrawPass_);
}
}

// Entry point of the depth raster worker thread. Repeatedly pulls the next
// unprocessed draw from depthDraws_ (tracked by curDraw_) and rasterizes it
// outside the lock. When it has caught up and the submitter has flagged
// finishedSubmitting_, it signals completion through finishedDrawing_ and
// depthFinishCond_. Runs until exitDepthThread_ is observed.
void DrawEngineCommon::DepthThreadFunc() {
	SetCurrentThreadName("DepthRaster");

	while (true) {
		DepthDraw draw;
		bool hasDraw = false;
		// Wait for a draw or exit.
		{
			std::unique_lock<std::mutex> lock(depthEnqueueMutex_);

			// Check for an exit request BEFORE possibly going to sleep. If the request
			// (and its notify) arrived while we were busy rasterizing, or before this
			// thread first got here, that notification is already gone; sleeping now
			// would never be interrupted and joining this thread would hang.
			// NOTE(review): for this to be fully race-free, the code that sets
			// exitDepthThread_ should do so while holding depthEnqueueMutex_ - confirm.
			if (exitDepthThread_) {
				break;
			}

			if (depthDraws_.size() == curDraw_) {
				gpuStats.numDepthThreadCaughtUp++;
				// We've drawn everything that has been queued so far. If the submitter
				// has declared the batch complete, report that we're done.
				if (finishedSubmitting_) {
					gpuStats.numDepthThreadFinished++;
					gpuStats.numDepthDraws += (int)depthDraws_.size();

					// lock.unlock(); // possible optimization?
					std::lock_guard<std::mutex> flock(depthFinishMutex_);
					finishedDrawing_ = true;
					depthFinishCond_.notify_one();
				}

				// OK, wait for something to do. A wakeup (possibly spurious) simply
				// sends us around the loop to re-evaluate the queue state.
				depthEnqueueCond_.wait(lock);
			} else {
				_dbg_assert_(curDraw_ < depthDraws_.size());
				// Copy the draw out so it can be processed without holding the lock.
				draw = depthDraws_[curDraw_++];
				hasDraw = true;
			}

			// An exit request may also have been what woke us up.
			if (exitDepthThread_) {
				break;
			}
			// Then just loop around and wait for the next event.
		}

		// We have a private copy of a draw and hold no locks. Do the work.
		if (hasDraw) {
			ProcessDepthDraw(draw);
		}
	}
}

void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {
if (!gstate.isModeClear() && (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())) {
return;
Expand Down Expand Up @@ -1042,13 +1185,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
depthIndexCount_ += vertexCount;
depthVertexCount_ += numDecoded;

if (depthDraws_.empty()) {
rasterTimeStart_ = time_now_d();
}

depthDraws_.push_back(draw);

// FlushQueuedDepth();
EnqueueDepthDraw(draw);
}

void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount) {
Expand Down Expand Up @@ -1083,57 +1220,5 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
depthIndexCount_ += vertexCount;
depthVertexCount_ += numDecoded;

depthDraws_.push_back(draw);

if (depthDraws_.empty()) {
rasterTimeStart_ = time_now_d();
}
// FlushQueuedDepth();
}

void DrawEngineCommon::FlushQueuedDepth() {
if (rasterTimeStart_ != 0.0) {
gpuStats.msRasterTimeAvailable += time_now_d() - rasterTimeStart_;
rasterTimeStart_ = 0.0;
}

const bool collectStats = coreCollectDebugStats;
const bool lowQ = g_Config.iDepthRasterMode == (int)DepthRasterMode::LOW_QUALITY;

for (const auto &draw : depthDraws_) {
int *tx = depthScreenVerts_;
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
float *tz = (float *)(depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2);

int outVertCount = 0;

const float *vertices = depthTransformed_ + 4 * draw.vertexOffset;
const uint16_t *indices = depthIndices_ + draw.indexOffset;

DepthScissor tileScissor = draw.scissor.Tile(0, 1);

{
TimeCollector collectStat(&gpuStats.msCullDepth, collectStats);
switch (draw.prim) {
case GE_PRIM_RECTANGLES:
outVertCount = DepthRasterClipIndexedRectangles(tx, ty, tz, vertices, indices, draw, tileScissor);
break;
case GE_PRIM_TRIANGLES:
outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, vertices, indices, draw, tileScissor);
break;
default:
_dbg_assert_(false);
break;
}
}
{
TimeCollector collectStat(&gpuStats.msRasterizeDepth, collectStats);
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(draw.depthAddr), draw.depthStride, tx, ty, tz, outVertCount, draw, tileScissor, lowQ);
}
}

// Reset queue
depthIndexCount_ = 0;
depthVertexCount_ = 0;
depthDraws_.clear();
EnqueueDepthDraw(draw);
}
Loading
Loading