Skip to content

Commit

Permalink
memcpy test
Browse files Browse the repository at this point in the history
  • Loading branch information
CptMoore committed Jan 15, 2025
1 parent dc9097e commit 78b138d
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 12 deletions.
1 change: 1 addition & 0 deletions ModTek/Features/Logging/AppenderFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ internal AppenderFile(string path, AppenderSettings settings)
ModTek v{GitVersionInformation.InformationalVersion} ({GitVersionInformation.CommitDate}) ; HarmonyX {typeof(Harmony).Assembly.GetName().Version}
{Environment.OSVersion} ; BattleTech {Application.version} ; Unity {Application.unityVersion} ; CLR {Environment.Version} ; {System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription}"
{dateTime.ToLocalTime().ToString("o", CultureInfo.InvariantCulture)} ; Startup {unityStartupTime.ToString(null, CultureInfo.InvariantCulture)} ; Ticks {stopwatchTimestamp} ; Timestamp Overhead {MTStopwatch.OverheadPerTimestampInNanoseconds}ns ; MemCpy->BlockCpy threshold {FastBuffer.MemcpyThreshold}
Memcpy512oTicks {FastBuffer.Memcpy512oTicks} ; Memcpy1024Ticks {FastBuffer.Memcpy1024Ticks} ; Memcpy512Ticks {FastBuffer.Memcpy512Ticks} ; Memcpy256Ticks {FastBuffer.Memcpy256Ticks} ; Memcpy128Ticks {FastBuffer.Memcpy128Ticks} ; Memcpy64Ticks {FastBuffer.Memcpy64Ticks}
{new string('-', 80)}
"""
Expand Down
166 changes: 154 additions & 12 deletions ModTek/Features/Logging/FastBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ internal void Append(byte[] value)
{
fixed (byte* bytes = value)
{
Memcpy256(position, bytes, value.Length);
Memcpy512(position, bytes, value.Length);
}
}
}
Expand Down Expand Up @@ -115,7 +115,7 @@ private static int FindMemCpyThreshold()
var start = shouldMeasure ? MTStopwatch.GetTimestamp() : 0;
fixed (byte* bytes = srcA)
{
Memcpy256(dst, bytes, size);
Memcpy512(dst, bytes, size);
}
if (shouldMeasure)
{
Expand Down Expand Up @@ -317,11 +317,90 @@ private void EnlargeCapacity(int targetLength)
_isG2 = false;
}

// Adapted from Buffer.memcpy* and extended to copy in wider units (128 bit, 256 bit and up).
// The JIT can emit xmm (128 bit) moves, and the CPU seems to optimize paired xmm moves (2x128) even further.
// Copies `size` bytes from src to dest: 8 bytes at a time while possible,
// then 2 bytes at a time, then a single trailing byte if one is left.
// Nothing is copied when size is zero or negative.
internal static void Memcpy64(byte* dest, byte* src, int size)
{
    var remaining = size;

    // bulk part: one ulong (8 bytes) per step
    while (remaining >= sizeof(ulong))
    {
        *(ulong*)dest = *(ulong*)src;
        dest += sizeof(ulong);
        src += sizeof(ulong);
        remaining -= sizeof(ulong);
    }

    // fewer than 8 bytes are left here: one ushort (2 bytes) per step
    while (remaining >= sizeof(ushort))
    {
        *(ushort*)dest = *(ushort*)src;
        dest += sizeof(ushort);
        src += sizeof(ushort);
        remaining -= sizeof(ushort);
    }

    // at most one byte can still be outstanding
    if (remaining > 0)
    {
        *dest = *src;
    }
}

// Copies `size` bytes from src to dest in My128Bit-sized steps (16 bytes each)
// and hands whatever is left over to Memcpy64.
internal static void Memcpy128(byte* dest, byte* src, int size)
{
    var remaining = size;
    while (remaining >= My128Bit.Size)
    {
        // a struct assignment through the pointers moves one whole 128 bit unit
        *(My128Bit*)dest = *(My128Bit*)src;
        dest += My128Bit.Size;
        src += My128Bit.Size;
        remaining -= My128Bit.Size;
    }

    // the tail is shorter than one 128 bit unit
    Memcpy64(dest, src, remaining);
}

// Copies `size` bytes from src to dest in My256Bit-sized steps (32 bytes each)
// and delegates the remaining tail (shorter than one step) to Memcpy128.
internal static void Memcpy256(byte* dest, byte* src, int size)
{
    // 25% faster than if using 2x128 on AMD Zen4 hardware
    // (this note used to sit on a nested scope block whose opening brace was never
    //  closed; the scope is gone, the loop below is the whole 256 bit stage)
    const int BatchSize = My256Bit.Size;
    for (; size >= BatchSize; size -= BatchSize)
    {
        // a struct assignment through the pointers moves one whole 256 bit unit
        *(My256Bit*)dest = *(My256Bit*)src;
        dest += BatchSize;
        src += BatchSize;
    }

    // fewer than BatchSize bytes are left at this point
    Memcpy128(dest, src, size);
}
// Copies `size` bytes from src to dest in My512Bit-sized steps (64 bytes each)
// and delegates the remaining tail to Memcpy256.
internal static void Memcpy512(byte* dest, byte* src, int size)
{
    var remaining = size;
    while (remaining >= My512Bit.Size)
    {
        // a struct assignment through the pointers moves one whole 512 bit unit
        *(My512Bit*)dest = *(My512Bit*)src;
        dest += My512Bit.Size;
        src += My512Bit.Size;
        remaining -= My512Bit.Size;
    }

    // the tail is shorter than one 512 bit unit
    Memcpy256(dest, src, remaining);
}
// Copies `size` bytes from src to dest in My1024Bit-sized steps (128 bytes each)
// and delegates the remaining tail to Memcpy512.
internal static void Memcpy1024(byte* dest, byte* src, int size)
{
    var remaining = size;
    while (remaining >= My1024Bit.Size)
    {
        // a struct assignment through the pointers moves one whole 1024 bit unit
        *(My1024Bit*)dest = *(My1024Bit*)src;
        dest += My1024Bit.Size;
        src += My1024Bit.Size;
        remaining -= My1024Bit.Size;
    }

    // the tail is shorter than one 1024 bit unit
    Memcpy512(dest, src, remaining);
}

internal static void Memcpy512o(byte* dest, byte* src, int size)
{
{
const int BatchSize = My512Bit.Size;
for (; size >= BatchSize; size -= BatchSize)
{
*(My512Bit*)dest = *(My512Bit*)src;
dest += BatchSize;
src += BatchSize;
}
}
{
const int BatchSize = My256Bit.Size;
for (; size >= BatchSize; size -= BatchSize)
{
Expand All @@ -330,7 +409,7 @@ internal static void Memcpy256(byte* dest, byte* src, int size)
src += BatchSize;
}
}
{ // 100% faster than if using 2x64 on xmm hardware
{
const int BatchSize = My128Bit.Size;
for (; size >= BatchSize; size -= BatchSize)
{
Expand Down Expand Up @@ -363,20 +442,83 @@ internal static void Memcpy256(byte* dest, byte* src, int size)
}
}

// the jit can optimize this to 2x xmm 128 ops
// and 2x 128bit ops together are 25% faster than looping over 128bit ops
private struct My128Bit
// AVX2 - Intel® Core™ i7-10875H (Bluewinds)
// Memcpy512oTicks 140 ; Memcpy1024Ticks 655 ; Memcpy512Ticks 135 ; Memcpy256Ticks 135 ; Memcpy128Ticks 153 ; Memcpy64Ticks 150

// AVX2 - AMD 6850U (CptMoore)
// Memcpy512oTicks 140 ; Memcpy1024Ticks 667 ; Memcpy512Ticks 139 ; Memcpy256Ticks 147 ; Memcpy128Ticks 152 ; Memcpy64Ticks 159

// AVX512 Double Pump -
//

// AVX512 -
//

// SSE -
//

// 1024 bit (128 byte) copy unit made of two My512Bit fields.
// should translate to 8x128 ops
// Note: the stale 128 bit members (a second Size constant and two long fields)
// were removed; they duplicated the member names below and would have made the
// struct larger than the Size that Memcpy1024 advances by.
private struct My1024Bit
{
    internal const int Size = 1024/8;
    internal My512Bit _00;
    internal My512Bit _01;
}
// 512 bit (64 byte) copy unit made of two My256Bit fields.
// should translate to 4x128 ops
private struct My512Bit
{
    internal My256Bit _00, _01;

    // size of one unit in bytes
    internal const int Size = 512 / 8;
}
// 256 bit (32 byte) copy unit made of two My128Bit fields.
// should translate to 2x128 ops
private struct My256Bit
{
    internal My128Bit _00, _01;

    // size of one unit in bytes
    internal const int Size = 256 / 8;
}
// 128 bit (16 byte) copy unit made of two long fields.
// should translate to xmm 128 op
private struct My128Bit
{
    internal long _00, _01;

    // size of one unit in bytes
    internal const int Size = 128 / 8;
}

// One benchmark result per copy variant: each field stores the value that
// CalcMemcpyTicks returns for the given routine. The initializers run once,
// in the order written here, when the static fields of the type are initialized.
internal static readonly long Memcpy64Ticks = CalcMemcpyTicks(Memcpy64);
internal static readonly long Memcpy128Ticks = CalcMemcpyTicks(Memcpy128);
internal static readonly long Memcpy256Ticks = CalcMemcpyTicks(Memcpy256);
internal static readonly long Memcpy512Ticks = CalcMemcpyTicks(Memcpy512);
internal static readonly long Memcpy1024Ticks = CalcMemcpyTicks(Memcpy1024);
internal static readonly long Memcpy512oTicks = CalcMemcpyTicks(Memcpy512o);

// Common signature of the copy routines above so they can be handed to CalcMemcpyTicks:
// copy `size` bytes from `src` to `dst`.
private delegate void Memcpy(byte* dst, byte* src, int size);
// Measures how long the given copy routine needs to copy MaxSize bytes.
//
// The routine is first run WarmupCount * TestRunsPerSize times without timing,
// then TestRunsPerSize more times with each call timed via MTStopwatch.GetTimestamp.
// The collected timings are reduced with MTStopwatch.TicksMin
// (presumably the smallest measured duration - confirm against MTStopwatch).
//
// memcpy:  the copy routine to benchmark
// returns: the value MTStopwatch.TicksMin computes from the timed runs
private static long CalcMemcpyTicks(Memcpy memcpy)
{
    // deliberately not a multiple of any batch size, so every tail stage gets exercised
    const int MaxSize = 512 * 1024 - 1;
    const int TestRunsPerSize = 100;
    const int WarmupCount = 1000;

    // The buffers live on the heap and are pinned below. Two stackalloc'ed buffers of
    // this size would need about 1 MiB of stack, which can exhaust a thread's stack and
    // terminate the process with a stack overflow that cannot be caught.
    var srcBuffer = new byte[MaxSize];
    var dstBuffer = new byte[MaxSize];

    var memCpyTicks = new long[TestRunsPerSize];

    fixed (byte* src = srcBuffer)
    fixed (byte* dst = dstBuffer)
    {
        // the last pass (w == WarmupCount) is the only one that is measured
        for (var w = 0; w < WarmupCount + 1; w++)
        {
            var shouldMeasure = w == WarmupCount;
            for (var run = 0; run < TestRunsPerSize; run++)
            {
                var start = shouldMeasure ? MTStopwatch.GetTimestamp() : 0;
                memcpy(dst, src, MaxSize);
                if (shouldMeasure)
                {
                    memCpyTicks[run] = MTStopwatch.GetTimestamp() - start;
                }
            }
        }
    }

    return MTStopwatch.TicksMin(memCpyTicks);
}

~FastBuffer()
{
Expand Down

0 comments on commit 78b138d

Please sign in to comment.