diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 04058b0bc4ad2..cfc1ff03d4386 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -287,10 +287,18 @@ struct CommandListCoreFamily : CommandListImp { size_t dstSize, CmdListFillKernelArguments &outArguments, Kernel *kernel); + bool compactL3FlushEvent(bool dcFlush) const { + return this->compactL3FlushEventPacket && dcFlush; + } + bool eventSignalPipeControl(bool splitKernel, bool dcFlush) const { + return (this->pipeControlMultiKernelEventSync && splitKernel) || + compactL3FlushEvent(dcFlush); + } size_t cmdListCurrentStartOffset = 0; bool containsAnyKernel = false; bool pipeControlMultiKernelEventSync = false; + bool compactL3FlushEventPacket = false; }; template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index d2f24a6e15504..8fa18e582b0f0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -138,6 +138,7 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO this->frontEndStateTracking = L0HwHelper::enableFrontEndStateTracking(hwInfo); this->pipelineSelectStateTracking = L0HwHelper::enablePipelineSelectStateTracking(hwInfo); this->pipeControlMultiKernelEventSync = L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo); + this->compactL3FlushEventPacket = L0HwHelper::useCompactL3FlushEventPacket(hwInfo); if (device->isImplicitScalingCapable() && !this->internalUsage && !isCopyOnly()) { this->partitionCount = static_cast(this->device->getNEODevice()->getDeviceBitfield().count()); @@ -1187,11 +1188,12 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, } CmdListKernelLaunchParams launchParams = {}; - + bool dcFlush = false; Event *signalEvent = nullptr; if (hSignalEvent) { signalEvent = Event::fromHandle(hSignalEvent); launchParams.isHostSignalScopeEvent = !!(signalEvent->signalScope & ZE_EVENT_SCOPE_FLAG_HOST); + dcFlush = getDcFlushRequired(!!signalEvent->signalScope); } uint32_t kernelCounter = leftSize > 0 ? 1 : 0; @@ -1199,7 +1201,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, kernelCounter += rightSize > 0 ? 1 : 0; launchParams.isKernelSplitOperation = kernelCounter > 1; - bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation; + bool singlePipeControlPacket = eventSignalPipeControl(launchParams.isKernelSplitOperation, dcFlush); appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket); @@ -1551,9 +1553,11 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, CmdListKernelLaunchParams launchParams = {}; Event *signalEvent = nullptr; + bool dcFlush = false; if (hSignalEvent) { signalEvent = Event::fromHandle(hSignalEvent); launchParams.isHostSignalScopeEvent = !!(signalEvent->signalScope & ZE_EVENT_SCOPE_FLAG_HOST); + dcFlush = getDcFlushRequired(!!signalEvent->signalScope); } if (isCopyOnly()) { @@ -1610,7 +1614,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, setupFillKernelArguments(dstAllocation.offset, patternSize, size, fillArguments, builtinKernel); launchParams.isKernelSplitOperation = (fillArguments.leftRemainingBytes > 0 || fillArguments.rightRemainingBytes > 0); - bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation; + bool singlePipeControlPacket = eventSignalPipeControl(launchParams.isKernelSplitOperation, dcFlush); appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 46ddb105cc06c..700848b3d7005 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -163,20 +163,26 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K threadGroupDimensions->groupCountY, threadGroupDimensions->groupCountZ); } - NEO::GraphicsAllocation *eventAlloc = nullptr; + uint64_t eventAddress = 0; bool isTimestampEvent = false; bool l3FlushEnable = false; bool isHostSignalScopeEvent = launchParams.isHostSignalScopeEvent; + Event *compactEvent = nullptr; if (event) { - eventAlloc = &event->getAllocation(this->device); - commandContainer.addToResidencyContainer(eventAlloc); - bool flushRequired = !!event->signalScope && - !launchParams.isKernelSplitOperation; - l3FlushEnable = getDcFlushRequired(flushRequired); - isTimestampEvent = event->isUsingContextEndOffset(); - eventAddress = event->getPacketAddress(this->device); isHostSignalScopeEvent = !!(event->signalScope & ZE_EVENT_SCOPE_FLAG_HOST); + if (compactL3FlushEvent(getDcFlushRequired(!!event->signalScope))) { + compactEvent = event; + event = nullptr; + } else { + NEO::GraphicsAllocation *eventAlloc = &event->getAllocation(this->device); + commandContainer.addToResidencyContainer(eventAlloc); + bool flushRequired = !!event->signalScope && + !launchParams.isKernelSplitOperation; + l3FlushEnable = getDcFlushRequired(flushRequired); + isTimestampEvent = event->isUsingContextEndOffset(); + eventAddress = event->getPacketAddress(this->device); + } } bool isKernelUsingSystemAllocation = false; @@ -249,6 +255,10 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K std::list additionalCommands; + if (compactEvent) { + appendEventForProfilingAllWalkers(compactEvent, true, true); + } + NEO::EncodeDispatchKernelArgs dispatchKernelArgs{ eventAddress, // eventAddress neoDevice, // device @@ -273,7 +283,9 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K NEO::EncodeDispatchKernel::encode(commandContainer, dispatchKernelArgs, getLogicalStateHelper()); this->containsStatelessUncachedResource = dispatchKernelArgs.requiresUncachedMocs; - if (event) { + if (compactEvent) { + appendEventForProfilingAllWalkers(compactEvent, false, true); + } else if (event) { if (partitionCount > 1) { event->setPacketsInUse(partitionCount); } @@ -404,7 +416,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelSplit(Kernel Event *event, const CmdListKernelLaunchParams &launchParams) { if (event) { - if (this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation) { + if (eventSignalPipeControl(launchParams.isKernelSplitOperation, getDcFlushRequired(!!event->signalScope))) { event = nullptr; } else { event->increaseKernelCount(); diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper.cpp b/level_zero/core/source/hw_helpers/l0_hw_helper.cpp index ec2ee273e69cd..45fd65bd93b4f 100644 --- a/level_zero/core/source/hw_helpers/l0_hw_helper.cpp +++ b/level_zero/core/source/hw_helpers/l0_hw_helper.cpp @@ -53,4 +53,11 @@ bool L0HwHelper::usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwI return false; } +bool L0HwHelper::useCompactL3FlushEventPacket(const NEO::HardwareInfo &hwInfo) { + if (NEO::DebugManager.flags.CompactL3FlushEventPacket.get() != -1) { + return !!NEO::DebugManager.flags.CompactL3FlushEventPacket.get(); + } + return false; +} + } // namespace L0 diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper.h b/level_zero/core/source/hw_helpers/l0_hw_helper.h index b512da30b6e07..3ad42fefed334 100644 --- a/level_zero/core/source/hw_helpers/l0_hw_helper.h +++ b/level_zero/core/source/hw_helpers/l0_hw_helper.h @@ -35,6 +35,7 @@ class L0HwHelper { static bool enableStateComputeModeTracking(const NEO::HardwareInfo &hwInfo); static bool enableImmediateCmdListHeapSharing(const NEO::HardwareInfo &hwInfo, bool cmdlistSupport); static bool usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo); + static bool useCompactL3FlushEventPacket(const NEO::HardwareInfo &hwInfo); virtual void setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const = 0; virtual L0::Event *createEvent(L0::EventPool *eventPool, const ze_event_desc_t *desc, L0::Device *device) const = 0; diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h index 46455c18a604f..675eba7a486b3 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h @@ -188,7 +188,9 @@ struct TestExpectedValues { uint32_t expectedKernelCount = 0; uint32_t expectedWalkerPostSyncOp = 0; uint32_t expectedPostSyncPipeControls = 0; + uint32_t expectDcFlush = 0; bool postSyncAddressZero = false; + bool workloadPartition = false; }; } // namespace ult diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 629e13a016e43..5fda5d8094aa7 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -47,6 +47,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::commandListPerThreadScratchSize; using BaseClass::commandListPreemptionMode; using BaseClass::commandsToPatch; + using BaseClass::compactL3FlushEventPacket; using BaseClass::containsAnyKernel; using BaseClass::containsCooperativeKernelsFlag; using BaseClass::csr; @@ -123,6 +124,7 @@ struct WhiteBox> using BaseClass::clearCommandsToPatch; using BaseClass::cmdQImmediate; using BaseClass::commandsToPatch; + using BaseClass::compactL3FlushEventPacket; using BaseClass::csr; using BaseClass::finalStreamState; using BaseClass::frontEndStateTracking; @@ -142,6 +144,7 @@ struct WhiteBox> template struct MockCommandListImmediate : public CommandListCoreFamilyImmediate { using BaseClass = CommandListCoreFamilyImmediate; + using BaseClass::compactL3FlushEventPacket; using BaseClass::containsAnyKernel; using BaseClass::immediateCmdListHeapSharing; using BaseClass::indirectAllocationsAllowed; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp index c9409a0644f56..621bf88df5d71 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp @@ -28,12 +28,15 @@ struct CopyTestInput { ze_event_pool_flags_t eventPoolFlags = 0; int32_t usePipeControlMultiPacketEventSync; + + bool useFirstEventPacketAddress = false; }; -template +template struct AppendMemoryCopyMultiPacketEventFixture : public DeviceFixture { void setUp() { DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); + DebugManager.flags.CompactL3FlushEventPacket.set(compactL3FlushEventPacket); if (multiTile == 1) { DebugManager.flags.CreateMultipleSubDevices.set(2); DebugManager.flags.EnableImplicitScaling.set(1); @@ -170,8 +173,8 @@ void testSingleTileAppendMemoryCopyThreeKernelsAndL3Flush(CopyTestInput &input, EXPECT_EQ(static_cast(arg.expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - uint64_t l3FlushPostSyncAddress = thirdKernelEventAddress + event->getSinglePacketSize(); - if (input.usePipeControlMultiPacketEventSync == 1) { + uint64_t l3FlushPostSyncAddress = event->getGpuAddress(input.device) + 2 * event->getSinglePacketSize() + event->getSinglePacketSize(); + if (input.usePipeControlMultiPacketEventSync == 1 || input.useFirstEventPacketAddress) { l3FlushPostSyncAddress = event->getGpuAddress(input.device); } if (event->isUsingContextEndOffset()) { @@ -291,7 +294,10 @@ void testSingleTileAppendMemoryCopySingleKernelAndL3Flush(CopyTestInput &input, EXPECT_EQ(static_cast(arg.expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + event->getSinglePacketSize(); + uint64_t l3FlushPostSyncAddress = event->getGpuAddress(input.device) + event->getSinglePacketSize(); + if (input.useFirstEventPacketAddress) { + l3FlushPostSyncAddress = event->getGpuAddress(input.device); + } if (event->isUsingContextEndOffset()) { l3FlushPostSyncAddress += event->getContextEndOffset(); } @@ -496,7 +502,7 @@ void testMultiTileAppendMemoryCopyThreeKernelsAndL3Flush(CopyTestInput &input, T EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); uint64_t l3FlushPostSyncAddress = thirdKernelEventAddress + 2 * event->getSinglePacketSize(); - if (input.usePipeControlMultiPacketEventSync == 1) { + if (input.usePipeControlMultiPacketEventSync == 1 || input.useFirstEventPacketAddress) { l3FlushPostSyncAddress = event->getGpuAddress(input.device); } if (event->isUsingContextEndOffset()) { @@ -627,7 +633,12 @@ void testMultiTileAppendMemoryCopySingleKernelAndL3Flush(CopyTestInput &input, T EXPECT_EQ(static_cast(arg.expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + 2 * event->getSinglePacketSize(); + uint64_t l3FlushPostSyncAddress = 0; + if (input.useFirstEventPacketAddress) { + l3FlushPostSyncAddress = event->getGpuAddress(input.device); + } else { + l3FlushPostSyncAddress = event->getGpuAddress(input.device) + 2 * event->getSinglePacketSize(); + } if (event->isUsingContextEndOffset()) { l3FlushPostSyncAddress += event->getContextEndOffset(); } @@ -655,7 +666,7 @@ void testMultiTileAppendMemoryCopySingleKernelAndL3Flush(CopyTestInput &input, T EXPECT_EQ(expectedDcFlush, dcFlushFound); } -using AppendMemoryCopyXeHpAndLaterMultiPacket = Test>; +using AppendMemoryCopyXeHpAndLaterMultiPacket = Test>; HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernels, @@ -768,7 +779,7 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, testSingleTileAppendMemoryCopySignalScopeEventToSubDevice(input, arg); } -using AppendMemoryCopyXeHpAndLaterSinglePacket = Test>; +using AppendMemoryCopyXeHpAndLaterSinglePacket = Test>; HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, givenCommandListWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForRegisterOnly, @@ -881,7 +892,7 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, testSingleTileAppendMemoryCopySignalScopeEventToSubDevice(input, arg); } -using MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket = Test>; +using MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket = Test>; HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket, givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernels, @@ -985,7 +996,7 @@ HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket, testMultiTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); } -using MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket = Test>; +using MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket = Test>; HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket, givenMultiTileCommandListWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForMultiTileRegisterPipeControlPacket, @@ -1088,5 +1099,428 @@ HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket, testMultiTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); } + +using AppendMemoryCopyL3CompactEventTest = Test>; + +HWTEST2_F(AppendMemoryCopyL3CompactEventTest, + givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernels, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 3; + arg.expectedKernelCount = 3; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + testSingleTileAppendMemoryCopyThreeKernels(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactEventTest, + givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleKernel, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + testSingleTileAppendMemoryCopySingleKernel(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactEventTest, + givenCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernelsAndL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testSingleTileAppendMemoryCopyThreeKernelsAndL3Flush(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactEventTest, + givenCommandListAndEventWithSignalScopeWhenImmediateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForForL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testSingleTileAppendMemoryCopyThreeKernelsAndL3Flush(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactEventTest, + givenCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testSingleTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactEventTest, + givenCommandListAndEventWithSignalScopeWhenImmediateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testSingleTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); +} + +using MultiTileAppendMemoryCopyL3CompactEventTest = Test>; + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactEventTest, + givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernels, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 6; + arg.expectedKernelCount = 3; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + testMultiTileAppendMemoryCopyThreeKernels(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactEventTest, + givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateMultiTileKernel, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + testMultiTileAppendMemoryCopySingleKernel(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactEventTest, + givenMultiTileCommandListCopyUsingThreeKernelsAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testMultiTileAppendMemoryCopyThreeKernelsAndL3Flush(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactEventTest, + givenMultiTileCommandListCopyUsingThreeKernelsAndEventWithSignalScopeWhenImmdiateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testMultiTileAppendMemoryCopyThreeKernelsAndL3Flush(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactEventTest, + givenMultiTileCommandListCopyUsingSingleKernelAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testMultiTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactEventTest, + givenMultiTileCommandListCopyUsingSingleKernelAndEventWithSignalScopeWhenImmdiateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testMultiTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); +} + +using AppendMemoryCopyL3CompactAndSingleKernelPacketEventTest = Test>; + +HWTEST2_F(AppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenCommandListWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSinglePacket, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + testSingleTileAppendMemoryCopyThreeKernels(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleKernel, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + testSingleTileAppendMemoryCopySingleKernel(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenCommandListCopyUsingThreeKernelsAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testSingleTileAppendMemoryCopyThreeKernelsAndL3Flush(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenCommandListCopyUsingThreeKernelsAndEventWithSignalScopeWhenImmediateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + input.eventPoolFlags = 0; + + testSingleTileAppendMemoryCopyThreeKernelsAndL3Flush(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenCommandListCopyUsingSingleKernelAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testSingleTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); +} + +HWTEST2_F(AppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenCommandListCopyUsingSingleKernelAndEventWithSignalScopeWhenImmediateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testSingleTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); +} + +using MultiTileAppendMemoryCopyL3CompactAndSingleKernelPacketEventTest = Test>; + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernels, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + testMultiTileAppendMemoryCopyThreeKernels(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleMultiTileKernel, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + testMultiTileAppendMemoryCopySingleKernel(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenMultiTileCommandListCopyUsingThreeKernelsAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testMultiTileAppendMemoryCopyThreeKernelsAndL3Flush(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenMultiTileCommandListCopyUsingThreeKernelsAndEventWithSignalScopeWhenImmdiateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1231); + input.dstPtr = reinterpret_cast(0x200002345); + input.size = 0x100002345; + + input.eventPoolFlags = 0; + + testMultiTileAppendMemoryCopyThreeKernelsAndL3Flush(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenMultiTileCommandListCopyUsingThreeKernelAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testMultiTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); +} + +HWTEST2_F(MultiTileAppendMemoryCopyL3CompactAndSingleKernelPacketEventTest, + givenMultiTileCommandListCopyUsingSingleKernelAndEventWithSignalScopeWhenImmdiateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.srcPtr = reinterpret_cast(0x1000); + input.dstPtr = reinterpret_cast(0x20000000); + input.size = 0x100000000; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testMultiTileAppendMemoryCopySingleKernelAndL3Flush(input, arg); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill_event_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill_event_xehp_and_later.cpp index d6b07b7937ffa..aed704d167044 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill_event_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill_event_xehp_and_later.cpp @@ -26,12 +26,15 @@ struct FillTestInput { void *patternPtr = nullptr; ze_event_pool_flags_t eventPoolFlags = 0; + + bool useFirstEventPacketAddress = false; }; -template +template struct AppendFillMultiPacketEventFixture : public AppendFillFixture { void setUp() { DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); + DebugManager.flags.CompactL3FlushEventPacket.set(compactL3FlushEventPacket); if (multiTile == 1) { DebugManager.flags.CreateMultipleSubDevices.set(2); DebugManager.flags.EnableImplicitScaling.set(1); @@ -157,6 +160,62 @@ void testSingleTileAppendMemoryFillManyKernels(FillTestInput &input, TestExpecte EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); } +template +void testSingleTileAppendMemoryFillManyKernelsAndL3Flush(FillTestInput &input, TestExpectedValues &arg) { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = input.eventPoolFlags; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(input.driver, input.context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, input.device)); + + uint64_t firstKernelEventAddress = arg.postSyncAddressZero ? 0 : event->getGpuAddress(input.device); + uint64_t secondKernelEventAddress = arg.postSyncAddressZero ? 0 : event->getGpuAddress(input.device) + event->getSinglePacketSize(); + + auto commandList = std::make_unique>(); + commandList->initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); + auto &commandContainer = commandList->commandContainer; + + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + result = commandList->appendMemoryFill(input.dstPtr, input.patternPtr, + input.patternSize, input.allocSize, event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + EXPECT_EQ(arg.expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(arg.expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(2u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + auto secondWalker = itorWalkers[1]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(arg.expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*secondWalker); + EXPECT_EQ(static_cast(arg.expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); +} + template void testSingleTileAppendMemoryFillSingleKernel(FillTestInput &input, TestExpectedValues &arg) { using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; @@ -220,7 +279,7 @@ void testSingleTileAppendMemoryFillSingleKernelAndL3Flush(FillTestInput &input, ze_event_pool_desc_t eventPoolDesc = {}; eventPoolDesc.count = 1; - eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + eventPoolDesc.flags = input.eventPoolFlags; ze_event_desc_t eventDesc = {}; eventDesc.index = 0; @@ -263,7 +322,10 @@ void testSingleTileAppendMemoryFillSingleKernelAndL3Flush(FillTestInput &input, EXPECT_EQ(static_cast(arg.expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + event->getSinglePacketSize(); + uint64_t l3FlushPostSyncAddress = event->getGpuAddress(input.device); + if (!input.useFirstEventPacketAddress) { + l3FlushPostSyncAddress += event->getSinglePacketSize(); + } if (event->isUsingContextEndOffset()) { l3FlushPostSyncAddress += event->getContextEndOffset(); } @@ -452,7 +514,7 @@ void testMultiTileAppendMemoryFillSingleKernelAndL3Flush(FillTestInput &input, T EXPECT_EQ(expectedDcFlush, dcFlushFound); } -using AppendFillMultiPacketEventTest = Test>; +using AppendFillMultiPacketEventTest = Test>; HWTEST2_F(AppendFillMultiPacketEventTest, givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, @@ -506,10 +568,12 @@ HWTEST2_F(AppendFillMultiPacketEventTest, arg.expectedPostSyncPipeControls = 1; arg.postSyncAddressZero = false; + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + testSingleTileAppendMemoryFillSingleKernelAndL3Flush(input, arg); } -using AppendFillSinglePacketEventTest = Test>; +using AppendFillSinglePacketEventTest = Test>; HWTEST2_F(AppendFillSinglePacketEventTest, givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling, @@ -563,10 +627,12 @@ HWTEST2_F(AppendFillSinglePacketEventTest, arg.expectedPostSyncPipeControls = 1; arg.postSyncAddressZero = false; + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + testSingleTileAppendMemoryFillSingleKernelAndL3Flush(input, arg); } -using MultiTileAppendFillEventMultiPacketTest = Test>; +using MultiTileAppendFillEventMultiPacketTest = Test>; HWTEST2_F(MultiTileAppendFillEventMultiPacketTest, givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfilingAndSingleDcFlushWithImmediatePostSync, IsAtLeastXeHpCore) { @@ -646,7 +712,7 @@ HWTEST2_F(MultiTileAppendFillEventMultiPacketTest, testMultiTileAppendMemoryFillSingleKernelAndL3Flush(input, arg); } -using MultiTileAppendFillEventSinglePacketTest = Test>; +using MultiTileAppendFillEventSinglePacketTest = Test>; HWTEST2_F(MultiTileAppendFillEventSinglePacketTest, givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithNoPostSync, IsAtLeastXeHpCore) { @@ -685,5 +751,248 @@ HWTEST2_F(MultiTileAppendFillEventSinglePacketTest, testMultiTileAppendMemoryFillManyKernels(input, arg); } +using AppendFillCompactL3EventTest = Test>; + +HWTEST2_F(AppendFillCompactL3EventTest, + givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 2; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.dstPtr = immediateDstPtr; + input.allocSize = immediateAllocSize; + input.patternPtr = &immediatePattern; + input.patternSize = sizeof(immediatePattern); + + testSingleTileAppendMemoryFillManyImmediateKernels(input, arg); +} + +HWTEST2_F(AppendFillCompactL3EventTest, + givenCallToAppendMemoryFillWhenTimestampEventUsesWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 2; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.dstPtr = dstPtr; + input.allocSize = allocSize; + input.patternPtr = pattern; + input.patternSize = patternSize; + + testSingleTileAppendMemoryFillManyKernels(input, arg); +} + +HWTEST2_F(AppendFillCompactL3EventTest, + givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSync, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + testSingleTileAppendMemoryFillSingleKernel(input, arg); +} + +HWTEST2_F(AppendFillCompactL3EventTest, + givenAppendMemoryFillUsingL3CompactEventWhenPatternDispatchOneKernelThenUseRegisterPostSync, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testSingleTileAppendMemoryFillSingleKernelAndL3Flush(input, arg); +} + +HWTEST2_F(AppendFillCompactL3EventTest, + givenCallToAppendMemoryFillWhenL3CompactImmediateEventUsesPipeControlPostSyncThenSinglePipeControlPostSyncUsed, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testSingleTileAppendMemoryFillSingleKernelAndL3Flush(input, arg); +} + +using MultiTileAppendFillCompactL3EventTest = Test>; + +HWTEST2_F(MultiTileAppendFillCompactL3EventTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenPlatformNeedsDcFlushAndL3CompactTimestampEventThenRegisterPostSyncUsedOtherwiseUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo)) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + } else { + arg.expectedPacketsInUse = 4; + arg.expectedKernelCount = 2; + arg.expectedWalkerPostSyncOp = 3; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = false; + } + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + input.dstPtr = dstPtr; + input.allocSize = allocSize; + input.patternPtr = pattern; + input.patternSize = patternSize; + + testMultiTileAppendMemoryFillManyKernels(input, arg); +} + +HWTEST2_F(MultiTileAppendFillCompactL3EventTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenPlatformNeedsDcFlushAndL3CompactImmediateEventThenPipeControlPostSyncUsedOtherwiseUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo)) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + } else { + arg.expectedPacketsInUse = 4; + arg.expectedKernelCount = 2; + arg.expectedWalkerPostSyncOp = 3; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = false; + } + + input.eventPoolFlags = 0; + + input.dstPtr = dstPtr; + input.allocSize = allocSize; + input.patternPtr = pattern; + input.patternSize = patternSize; + + testMultiTileAppendMemoryFillManyKernels(input, arg); +} + +using AppendFillKernelSplitAndCompactL3EventTest = Test>; + +HWTEST2_F(AppendFillKernelSplitAndCompactL3EventTest, + givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.postSyncAddressZero = true; + + input.dstPtr = immediateDstPtr; + input.allocSize = immediateAllocSize; + input.patternPtr = &immediatePattern; + input.patternSize = sizeof(immediatePattern); + + testSingleTileAppendMemoryFillManyImmediateKernels(input, arg); +} + +HWTEST2_F(AppendFillKernelSplitAndCompactL3EventTest, + givenCallToAppendMemoryFillWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.postSyncAddressZero = true; + + input.dstPtr = dstPtr; + input.allocSize = allocSize; + input.patternPtr = pattern; + input.patternSize = patternSize; + + testSingleTileAppendMemoryFillManyKernels(input, arg); +} + +HWTEST2_F(AppendFillKernelSplitAndCompactL3EventTest, + givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSync, + IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + testSingleTileAppendMemoryFillSingleKernel(input, arg); +} + +HWTEST2_F(AppendFillKernelSplitAndCompactL3EventTest, + givenAppendMemoryFillUsingL3CompactTimestampEventWhenPatternDispatchOneKernelThenUseRegisterPostSync, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testSingleTileAppendMemoryFillSingleKernelAndL3Flush(input, arg); +} + +HWTEST2_F(AppendFillKernelSplitAndCompactL3EventTest, + givenAppendMemoryFillUsingL3CompactImmediateEventWhenPatternDispatchOneKernelThenUsePipeControlPostSync, + IsXeHpOrXeHpgCore) { + arg.expectedPacketsInUse = 1; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testSingleTileAppendMemoryFillSingleKernelAndL3Flush(input, arg); +} + +using MultiTileAppendFillKernelSplitAndCompactL3EventTest = Test>; + +HWTEST2_F(MultiTileAppendFillKernelSplitAndCompactL3EventTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenL3CompactTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithNoPostSync, IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 0; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + input.dstPtr = dstPtr; + input.allocSize = allocSize; + input.patternPtr = pattern; + input.patternSize = patternSize; + + testMultiTileAppendMemoryFillManyKernels(input, arg); +} + +HWTEST2_F(MultiTileAppendFillKernelSplitAndCompactL3EventTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenL3CompactImmediateEventUsesPipeControlPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithImmediatePostSync, IsAtLeastXeHpCore) { + arg.expectedPacketsInUse = 2; + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 2; + arg.expectedWalkerPostSyncOp = 0; + arg.expectedPostSyncPipeControls = 1; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = 0; + + input.dstPtr = dstPtr; + input.allocSize = allocSize; + input.patternPtr = pattern; + input.patternSize = patternSize; + + testMultiTileAppendMemoryFillManyKernels(input, arg); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp index fb4a3d031e98f..2ccca6877adad 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp @@ -296,5 +296,250 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenVariousKernelsAndPatchingDisallowe pCommandList->reset(); } +struct AppendKernelTestInput { + DriverHandle *driver = nullptr; + L0::Context *context = nullptr; + L0::Device *device = nullptr; + + ze_event_pool_flags_t eventPoolFlags = 0; + + uint32_t packetOffsetMul = 1; + + bool useFirstEventPacketAddress = false; +}; + +template +struct CommandListAppendLaunchKernelCompactL3FlushEventFixture : public ModuleFixture { + void setUp() { + DebugManager.flags.CompactL3FlushEventPacket.set(compactL3FlushEventPacket); + if constexpr (multiTile == 1) { + DebugManager.flags.CreateMultipleSubDevices.set(2); + DebugManager.flags.EnableImplicitScaling.set(1); + arg.workloadPartition = true; + arg.expectDcFlush = 2; // DC Flush multi-tile platforms require DC Flush + x-tile sync after implicit scaling COMPUTE_WALKER + input.packetOffsetMul = 2; + } else { + arg.expectDcFlush = 1; + } + ModuleFixture::setUp(); + + input.driver = driverHandle.get(); + input.context = context; + input.device = device; + } + + template + void testAppendLaunchKernelAndL3Flush(AppendKernelTestInput &input, TestExpectedValues &arg) { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + Mock<::L0::Kernel> kernel; + auto module = std::unique_ptr(new Mock(input.device, nullptr)); + kernel.module = module.get(); + + auto commandList = std::make_unique>>(); + auto result = commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = input.eventPoolFlags; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + auto eventPool = std::unique_ptr(L0::EventPool::create(input.driver, input.context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, input.device)); + + uint64_t firstKernelEventAddress = arg.postSyncAddressZero ? 0 : event->getGpuAddress(input.device); + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + result = commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, event->toHandle(), 0, nullptr, launchParams); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(arg.expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(arg.expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), + commandList->commandContainer.getCommandStream()->getUsed())); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(arg.expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + uint64_t l3FlushPostSyncAddress = event->getGpuAddress(input.device) + input.packetOffsetMul * event->getSinglePacketSize(); + if (input.useFirstEventPacketAddress) { + l3FlushPostSyncAddress = event->getGpuAddress(input.device); + } + if (event->isUsingContextEndOffset()) { + l3FlushPostSyncAddress += event->getContextEndOffset(); + } + + auto itorPipeControls = findAll(firstWalker, cmdList.end()); + + uint32_t postSyncPipeControls = 0; + uint32_t dcFlushFound = 0; + for (auto it : itorPipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + if (arg.workloadPartition) { + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); + } else { + EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); + } + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + EXPECT_EQ(arg.expectedPostSyncPipeControls, postSyncPipeControls); + EXPECT_EQ(arg.expectDcFlush, dcFlushFound); + } + + DebugManagerStateRestore restorer; + + AppendKernelTestInput input = {}; + TestExpectedValues arg = {}; +}; + +using CommandListAppendLaunchKernelCompactL3FlushDisabledTest = Test>; + +HWTEST2_F(CommandListAppendLaunchKernelCompactL3FlushDisabledTest, + givenAppendKernelWithSignalScopeTimestampEventWhenComputeWalkerTimestampPostsyncAndL3ImmediatePostsyncUsedThenExpectComputeWalkerAndPipeControlPostsync, + IsXeHpOrXeHpgCore) { + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 2; + arg.expectedPostSyncPipeControls = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testAppendLaunchKernelAndL3Flush(input, arg); +} + +HWTEST2_F(CommandListAppendLaunchKernelCompactL3FlushDisabledTest, + givenAppendKernelWithSignalScopeImmediateEventWhenComputeWalkerImmediatePostsyncAndL3ImmediatePostsyncUsedThenExpectComputeWalkerAndPipeControlPostsync, + IsXeHpOrXeHpgCore) { + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 2; + arg.expectedPostSyncPipeControls = 1; + arg.expectedWalkerPostSyncOp = L0HwHelper::get(gfxCoreFamily).multiTileCapablePlatform() ? 3 : 1; + arg.postSyncAddressZero = false; + + input.eventPoolFlags = 0; + + testAppendLaunchKernelAndL3Flush(input, arg); +} + +using CommandListAppendLaunchKernelCompactL3FlushEnabledTest = Test>; + +HWTEST2_F(CommandListAppendLaunchKernelCompactL3FlushEnabledTest, + givenAppendKernelWithSignalScopeTimestampEventWhenRegisterTimestampPostsyncUsedThenExpectNoComputeWalkerAndPipeControlPostsync, + IsXeHpOrXeHpgCore) { + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 1; + arg.expectedPostSyncPipeControls = 0; + arg.expectedWalkerPostSyncOp = 0; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + input.useFirstEventPacketAddress = true; + + testAppendLaunchKernelAndL3Flush(input, arg); +} + +HWTEST2_F(CommandListAppendLaunchKernelCompactL3FlushEnabledTest, + givenAppendKernelWithSignalScopeImmediateEventWhenL3ImmediatePostsyncUsedThenExpectPipeControlPostsync, + IsXeHpOrXeHpgCore) { + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 1; + arg.expectedPostSyncPipeControls = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testAppendLaunchKernelAndL3Flush(input, arg); +} + +using CommandListAppendLaunchKernelMultiTileCompactL3FlushDisabledTest = Test>; + +HWTEST2_F(CommandListAppendLaunchKernelMultiTileCompactL3FlushDisabledTest, + givenAppendMultiTileKernelWithSignalScopeTimestampEventWhenComputeWalkerTimestampPostsyncAndL3ImmediatePostsyncUsedThenExpectComputeWalkerAndPipeControlPostsync, + IsXeHpOrXeHpgCore) { + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 4; + arg.expectedPostSyncPipeControls = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + testAppendLaunchKernelAndL3Flush(input, arg); +} + +HWTEST2_F(CommandListAppendLaunchKernelMultiTileCompactL3FlushDisabledTest, + givenAppendMultiTileKernelWithSignalScopeImmediateEventWhenComputeWalkerImmediatePostsyncAndL3ImmediatePostsyncUsedThenExpectComputeWalkerAndPipeControlPostsync, + IsXeHpOrXeHpgCore) { + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 4; + arg.expectedPostSyncPipeControls = 1; + arg.expectedWalkerPostSyncOp = 3; + arg.postSyncAddressZero = false; + + input.eventPoolFlags = 0; + + testAppendLaunchKernelAndL3Flush(input, arg); +} + +using CommandListAppendLaunchKernelMultiTileCompactL3FlushEnabledTest = Test>; + +HWTEST2_F(CommandListAppendLaunchKernelMultiTileCompactL3FlushEnabledTest, + givenAppendMultiTileKernelWithSignalScopeTimestampEventWhenRegisterTimestampPostsyncUsedThenExpectNoComputeWalkerAndPipeControlPostsync, + IsXeHpOrXeHpgCore) { + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 2; + arg.expectedPostSyncPipeControls = 0; + arg.expectedWalkerPostSyncOp = 0; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + input.useFirstEventPacketAddress = true; + + testAppendLaunchKernelAndL3Flush(input, arg); +} + +HWTEST2_F(CommandListAppendLaunchKernelMultiTileCompactL3FlushEnabledTest, + givenAppendMultiTileKernelWithSignalScopeImmediateEventWhenL3ImmediatePostsyncUsedThenExpectPipeControlPostsync, + IsXeHpOrXeHpgCore) { + arg.expectedKernelCount = 1; + arg.expectedPacketsInUse = 2; + arg.expectedPostSyncPipeControls = 1; + arg.expectedWalkerPostSyncOp = 0; + arg.postSyncAddressZero = true; + + input.eventPoolFlags = 0; + input.useFirstEventPacketAddress = true; + + testAppendLaunchKernelAndL3Flush(input, arg); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp b/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp index ff11197a2b58e..5a09f0f12e75a 100644 --- a/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp +++ b/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp @@ -635,5 +635,11 @@ TEST_F(L0HwHelperTest, givenL0HelperWhenGettingDefaultValueForUsePipeControlMult EXPECT_FALSE(defaultValue); } +TEST_F(L0HwHelperTest, givenL0HelperWhenGettingDefaultValueForCompactL3FlushEventPacketThenReturnFalse) { + auto hwInfo = *NEO::defaultHwInfo.get(); + bool defaultValue = L0::L0HwHelper::useCompactL3FlushEventPacket(hwInfo); + EXPECT_FALSE(defaultValue); +} + } // namespace ult } // namespace L0 diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index e4f2bc2e035d8..1cb4aa5c42a7e 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -419,6 +419,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableChipsetUniqueUUID, -1, "Enables retrieving DECLARE_DEBUG_VARIABLE(int32_t, EnableFlushTaskSubmission, -1, "Driver uses csr flushTask for immediate commandlist submissions, -1:default (enabled), 0:disabled, 1:enabled") DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateCmdListHeapSharing, -1, "Immediate command lists using flush task use current csr heap instead private cmd list heap, -1:default (disabled), 0:disabled, 1:enabled") DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlMultiKernelEventSync, -1, "Use single PIPE_CONTROL for event signal of multi-kernel append operations instead multi-packet POSTSYNC_DATA from each COMPUTE_WALKER, -1: default , 0: disabled, 1: enabled") +DECLARE_DEBUG_VARIABLE(int32_t, CompactL3FlushEventPacket, -1, "Compact COMPUTE_WALKER event packet and L3 Flush signal packet into single event packet, -1: default , 0: disabled, 1: enabled") DECLARE_DEBUG_VARIABLE(int32_t, EnableBcsSwControlWa, -1, "Enable BCS WA via BCSSWCONTROL MMIO. -1: default, 0: disabled, 1: if src in system mem, 2: if dst in system mem, 3: if src and dst in system mem, 4: always") /* IMPLICIT SCALING */ diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index f8a94ca57247f..be0eb0da7283f 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -202,6 +202,7 @@ LimitBlitterMaxWidth = -1 LimitBlitterMaxHeight = -1 PostBlitCommand = -1 UseCommandBufferHeaderSizeForWddmQueueSubmission = 1 +CompactL3FlushEventPacket = -1 OverridePreemptionSurfaceSizeInMb = -1 OverrideLeastOccupiedBank = -1 UseAsyncDrmExec = -1