forked from synopse/mORMot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSynFPCx64MM.pas
2863 lines (2664 loc) · 104 KB
/
SynFPCx64MM.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/// Fast Memory Manager for FPC x86_64
// - this unit is a part of the freeware Synopse mORMot framework
// licensed under a MPL/GPL/LGPL three license - see LICENSE.md
unit SynFPCx64MM;
{
*****************************************************************************
A Multi-thread Friendly Memory Manager for FPC written in x86_64 assembly
- targetting Linux (and Windows) multi-threaded Services
- only for FPC on the x86_64 target - use the RTL MM on Delphi or ARM
- based on FastMM4 proven algorithms by Pierre le Riche
- code has been reduced to the only necessary featureset for production
- deep asm refactoring for cross-platform, compactness and efficiency
- can report detailed statistics (with threads contention and memory leaks)
- mremap() makes large block ReallocMem a breeze on Linux :)
- inlined SSE2 movaps loop is more efficient that subfunction(s)
- lockless round-robin of tiny blocks (<=128/256 bytes) for better scaling
- optional lockless bin list to avoid freemem() thread contention
- three app modes: default mono-thread friendly, FPCMM_SERVER or FPCMM_BOOST
Usage: include this unit as the very first in your FPC project uses clause
Why another Memory Manager on FPC?
- The built-in heap.inc is well written and cross-platform and cross-CPU,
but its threadvar arena for small blocks tends to consume a lot of memory
on multi-threaded servers, and has suboptimal allocation performance
- C memory managers (glibc, Intel TBB, jemalloc) have a very high RAM
consumption (especially Intel TBB) and do panic/SIGKILL on any GPF
- Pascal alternatives (FastMM4,ScaleMM2,BrainMM) are Windows+Delphi specific
- Our lockess round-robin of tiny blocks is a unique algorithm in MM AFAIK
- It was so fun diving into SSE2 x86_64 assembly and Pierre's insight
- Resulting code is still easy to understand and maintain
IMPORTANT NOTICE: seems stable on Linux and Win64 but feedback is welcome!
*****************************************************************************
This file is part of Synopse framework.
Synopse framework. Copyright (C) 2020 Arnaud Bouchez
Synopse Informatique - https://synopse.info
*** BEGIN LICENSE BLOCK *****
Version: MPL 1.1/GPL 2.0/LGPL 2.1
The contents of this file are subject to the Mozilla Public License Version
1.1 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
for the specific language governing rights and limitations under the License.
The Original Code is Synopse mORMot framework.
The Initial Developer of the Original Code is Arnaud Bouchez.
Portions created by the Initial Developer are Copyright (C) 2020
the Initial Developer. All Rights Reserved.
Contributor(s):
Alternatively, the contents of this file may be used under the terms of
either the GNU General Public License Version 2 or later (the "GPL"), or
the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
in which case the provisions of the GPL or the LGPL are applicable instead
of those above. If you wish to allow use of your version of this file only
under the terms of either the GPL or the LGPL, and not to allow others to
use your version of this file under the terms of the MPL, indicate your
decision by deleting the provisions above and replace them with the notice
and other provisions required by the GPL or the LGPL. If you do not delete
the provisions above, a recipient may use your version of this file under
the terms of any one of the MPL, the GPL or the LGPL.
***** END LICENSE BLOCK *****
}
{ ---- Ready-To-Use Scenarios for Memory Manager Tuning }
// by default, we target LCL/console mono-threaded apps to replace the RTL MM
// - you may define FPCMM_SERVER or even FPCMM_BOOST for a service/daemon
// if defined, set FPCMM_DEBUG and FPCMM_ASSUMEMULTITHREAD
// - those flags target well a multi-threaded service
// - consider FPCMM_BOOST to try more aggressive settings
{.$define FPCMM_SERVER}
// if defined, tiny blocks <= 256 bytes will have a bigger round-robin cycle
// - try to enable it if unexpected SmallGetmemSleepCount/SmallFreememSleepCount
// and SleepCount/SleepCycles contentions are reported by CurrentHeapStatus
// - will also use 2x (FPCMM_BOOST) or 4x (FPCMM_BOOSTER) more tiny blocks
// arenas to share among the threads - so process will consume slightly more RAM
// - warning: depending on the workload and hardware, it may actually be slower;
// consider FPCMM_SERVER as a fair alternative
{.$define FPCMM_BOOST}
{.$define FPCMM_BOOSTER}
{ ---- Fine Grained Memory Manager Tuning }
// includes more detailed information to WriteHeapStatus()
{.$define FPCMM_DEBUG}
// checks leaks and write them to the console at process shutdown
// - only basic information will be included: more debugging information (e.g.
// call stack) may be gathered using heaptrc or valgrid
{.$define FPCMM_REPORTMEMORYLEAKS}
// won't check the IsMultiThread global, but assume it is true
// - multi-threaded apps (e.g. a Server Daemon instance) will be faster with it
// - mono-threaded (console/LCL) apps are faster without this conditional
{.$define FPCMM_ASSUMEMULTITHREAD}
// let Freemem multi-thread contention use a lockless algorithm
// - on contention, Freemem won't yield the thread using an OS call, but fill
// an internal Bin list which will be released when the lock becomes available
// - from our tests on high thread contention, this may be slower on Linux, but
// sometimes slightly faster on Win64 (in a VM at least)
{.$define FPCMM_LOCKLESSFREE}
// won't use mremap but a regular getmem/move/freemem pattern
// - depending on the actual system (e.g. on a VM), mremap may be slower
{.$define FPCMM_NOMREMAP}
// on contention problem, execute "pause" opcode and spin retrying the lock
// - you may try to define this if you have more than one core, to follow Intel
// recommendation from https://software.intel.com/en-us/comment/1134767
// - on SkylakeX (Intel 7th gen), "pause" opcode went from 10-20 to 140 cycles,
// so we use rdtsc and a given number of cycles - see http://tiny.cc/toeaqz
// - from our tests on high thread contention, spinning is slower on both
// Linux and Windows, whatever Intel is advising
{.$define FPCMM_PAUSE}
// will export libc-like functions, and not replace the FPC MM
// - e.g. to use this unit as a stand-alone C memory allocator
{.$define FPCMM_STANDALONE}
interface
{$ifdef FPC}
// cut-down version of Synopse.inc to make this unit standalone
{$mode Delphi}
{$asmmode Intel}
{$inline on}
{$R-} // disable Range checking
{$S-} // disable Stack checking
{$W-} // disable stack frame generation
{$Q-} // disable overflow checking
{$B-} // expect short circuit boolean
{$ifdef CPUX64}
{$define FPC_CPUX64} // this unit is for FPC + x86_64 only
{$endif CPUX64}
{$ifdef FPCMM_BOOSTER}
{$define FPCMM_BOOST}
{$undef FPCMM_DEBUG} // when performance matters more than stats
{$endif FPCMM_BOOSTER}
{$ifdef FPCMM_BOOST}
{$undef FPCMM_SERVER}
{$define FPCMM_ASSUMEMULTITHREAD}
{$endif FPCMM_BOOST}
{$ifdef FPCMM_SERVER}
{$define FPCMM_DEBUG}
{$define FPCMM_ASSUMEMULTITHREAD}
{$endif FPCMM_SERVER}
{$endif FPC}
{$ifdef FPC_CPUX64}
// this unit is available only for FPC + X86_64 CPU
type
/// Arena (middle/large) heap information as returned by CurrentHeapStatus
TMMStatusArena = record
/// how many bytes are currently reserved (mmap) to the Operating System
CurrentBytes: PtrUInt;
/// how many bytes have been reserved (mmap) to the Operating System
CumulativeBytes: PtrUInt;
{$ifdef FPCMM_DEBUG}
/// maximum bytes count reserved (mmap) to the Operating System
PeakBytes: PtrUInt;
/// how many VirtualAlloc/mmap calls to the Operating System did occur
CumulativeAlloc: PtrUInt;
/// how many VirtualFree/munmap calls to the Operating System did occur
CumulativeFree: PtrUInt;
{$endif FPCMM_DEBUG}
/// how many times this Arena did wait from been unlocked by another thread
SleepCount: PtrUInt;
end;
/// heap information as returned by CurrentHeapStatus
TMMStatus = record
/// how many tiny/small memory blocks (<=2600) are currently allocated
SmallBlocks: PtrUInt;
/// how many bytes of tiny/small memory blocks are currently allocated
// - this size is part of the Medium.CurrentBytes arena
SmallBlocksSize: PtrUInt;
/// contain blocks up to 256KB (small and medium blocks)
Medium: TMMStatusArena;
/// large blocks > 256KB which are directly handled by the Operating System
Large: TMMStatusArena;
{$ifdef FPCMM_DEBUG}
/// how much rdtsc cycles were spent within SwitchToThread/NanoSleep API
// - we rdtsc since it is an indicative but very fast way of timing
SleepCycles: PtrUInt;
{$ifdef FPCMM_LOCKLESSFREE}
/// how many types Freemem() did spin to acquire its lock-less bin list
SmallFreememLockLessSpin: PtrUInt;
{$endif FPCMM_LOCKLESSFREE}
{$endif FPCMM_DEBUG}
/// how many times the Operating System Sleep/NanoSleep API was called
// - in a perfect world, should be as small as possible
SleepCount: PtrUInt;
/// how many times Getmem() did block and wait for a small block
// - see also GetSmallBlockContention()
SmallGetmemSleepCount: PtrUInt;
/// how many times Freemem() did block and wait for a small block
// - see also GetSmallBlockContention()
SmallFreememSleepCount: PtrUInt;
end;
PMMStatus = ^TMMStatus;
/// allocate a new memory buffer
// - as FPC default heap, _Getmem(0) returns _Getmem(1)
function _GetMem(size: PtrUInt): pointer;
/// allocate a new zeroed memory buffer
function _AllocMem(Size: PtrUInt): pointer;
/// release a memory buffer
// - returns the allocated size of the supplied pointer (as FPC default heap)
function _FreeMem(P: pointer): PtrUInt;
/// change the size of a memory buffer
// - won't move any data if in-place reallocation is possible
// - as FPC default heap, _ReallocMem(P=nil,Size) maps P := _getmem(Size) and
// _ReallocMem(P,0) maps _Freemem(P)
function _ReallocMem(var P: pointer; Size: PtrUInt): pointer;
/// retrieve the maximum size (i.e. the allocated size) of a memory buffer
function _MemSize(P: pointer): PtrUInt; inline;
/// retrieve high-level statistics about the current memory manager state
// - see also GetSmallBlockContention for detailed small blocks information
function CurrentHeapStatus: TMMStatus;
{$ifdef FPCMM_STANDALONE}
/// should be called before using any memory function
procedure InitializeMemoryManager;
/// should be called to finalize this memory manager process and release all RAM
procedure FreeAllMemory;
{$undef FPCMM_DEBUG} // excluded FPC-specific debugging
/// IsMultiThread global variable is not correct outside of the FPC RTL
{$define FPCMM_ASSUMEMULTITHREAD}
/// not supported to reduce dependencies and console writing
{$undef FPCMM_REPORTMEMORYLEAKS}
{$else}
type
/// one GetSmallBlockContention info about unexpected multi-thread waiting
// - a single GetmemBlockSize or FreememBlockSize non 0 field is set
TSmallBlockContention = packed record
/// how many times a small block getmem/freemem has been waiting for unlock
SleepCount: cardinal;
/// the small block size on which Getmem() has been blocked - or 0
GetmemBlockSize: cardinal;
/// the small block size on which Freemem() has been blocked - or 0
FreememBlockSize: cardinal;
end;
/// small blocks detailed information as returned GetSmallBlockContention
TSmallBlockContentionDynArray = array of TSmallBlockContention;
/// one GetSmallBlockStatus information
TSmallBlockStatus = packed record
/// how many times a memory block of this size has been allocated
Total: cardinal;
/// how many memory blocks of this size are currently allocated
Current: cardinal;
/// the standard size of the small memory block
BlockSize: cardinal;
end;
/// small blocks detailed information as returned GetSmallBlockStatus
TSmallBlockStatusDynArray = array of TSmallBlockStatus;
/// sort order of detailed information as returned GetSmallBlockStatus
TSmallBlockOrderBy = (obTotal, obCurrent, obBlockSize);
/// retrieve the use counts of allocated small blocks
// - returns maxcount biggest results, sorted by "orderby" field occurence
function GetSmallBlockStatus(maxcount: integer = 10;
orderby: TSmallBlockOrderBy = obTotal; count: PPtrUInt = nil;
bytes: PPtrUInt = nil): TSmallBlockStatusDynArray;
/// retrieve all small blocks which suffered from blocking during multi-thread
// - returns maxcount biggest results, sorted by SleepCount occurence
function GetSmallBlockContention(maxcount: integer = 10): TSmallBlockContentionDynArray;
/// convenient debugging function into the console
// - if smallblockcontentioncount > 0, includes GetSmallBlockContention() info
// up to the smallblockcontentioncount biggest occurences
procedure WriteHeapStatus(const context: shortstring = '';
smallblockstatuscount: integer = 8; smallblockcontentioncount: integer = 8;
compilationflags: boolean = false);
{$endif FPCMM_STANDALONE}
{$endif FPC_CPUX64}
implementation
{
High-level Algorithms Description
-----------------------------------
The allocator handles the following families of memory blocks:
- TINY <= 128 B (or <= 256 B for FPCMM_BOOST) - not existing in FastMM4
Round-robin distribution into several arenas, fed from medium blocks
(fair scaling from multi-threaded calls, with no threadvar nor GC involved)
- SMALL <= 2600 B
Single arena per block size, fed from medium blocks
- MEDIUM <= 256 KB
Pool of bitmap-marked chunks, fed from 1MB of OS mmap/virtualalloc
- LARGE > 256 KB
Directly fed from OS mmap/virtualalloc with mremap when growing
About locking:
- Tiny and Small blocks have their own per-size lock, in every arena
- Medium and Large blocks have one giant lock each (seldom used)
- SwitchToThread/FpNanoSleep OS call is done after initial spinning
- FPCMM_LOCKLESSFREE reduces OS calls on Freemem() thread contention
- FPCMM_DEBUG / WriteHeapStatus allows to identify the lock contention
}
{$ifdef FPC_CPUX64}
// this unit is available only for FPC + X86_64 CPU
{ ********* Operating System Specific API Calls }
{$ifdef MSWINDOWS}
var
HeapStatus: TMMStatus;
const
kernel32 = 'kernel32.dll';
MEM_COMMIT = $1000;
MEM_RESERVE = $2000;
MEM_RELEASE = $8000;
MEM_FREE = $10000;
MEM_TOP_DOWN = $100000;
PAGE_READWRITE = 4;
function VirtualAlloc(lpAddress: pointer;
dwSize: PtrUInt; flAllocationType, flProtect: Cardinal): pointer; stdcall;
external kernel32 name 'VirtualAlloc';
function VirtualFree(lpAddress: pointer; dwSize: PtrUInt;
dwFreeType: Cardinal): LongBool; stdcall;
external kernel32 name 'VirtualFree';
procedure SwitchToThread; stdcall;
external kernel32 name 'SwitchToThread';
function AllocMedium(Size: PtrInt): pointer; inline;
begin
// bottom-up allocation to reduce fragmentation
result := VirtualAlloc(nil, Size, MEM_COMMIT, PAGE_READWRITE);
end;
function AllocLarge(Size: PtrInt): pointer; inline;
begin
// top-down allocation to reduce fragmentation
result := VirtualAlloc(nil, Size, MEM_COMMIT or MEM_TOP_DOWN, PAGE_READWRITE);
end;
procedure Free(ptr: pointer; Size: PtrInt); inline;
begin
VirtualFree(ptr, 0, MEM_RELEASE);
end;
{$define FPCMM_NOMREMAP}
{$else}
uses
{$ifndef DARWIN}
syscall,
{$endif DARWIN}
BaseUnix;
var
HeapStatus: TMMStatus;
// we directly call the Kernel, so this unit doesn't require any libc
function AllocMedium(Size: PtrInt): pointer; inline;
begin
result := fpmmap(nil, Size, PROT_READ or PROT_WRITE,
MAP_PRIVATE or MAP_ANONYMOUS, -1, 0);
end;
function AllocLarge(Size: PtrInt): pointer; inline;
begin
result := fpmmap(nil, Size, PROT_READ or PROT_WRITE,
MAP_PRIVATE or MAP_ANONYMOUS, -1, 0);
end;
procedure Free(ptr: pointer; Size: PtrInt); inline;
begin
Size := fpmunmap(ptr, Size);
// assert(Size = 0);
end;
{$ifdef LINUX}
{$ifndef FPCMM_NOMREMAP}
const
syscall_nr_mremap = 25; // valid on x86_64 Linux and Android
MREMAP_MAYMOVE = 1;
function fpmremap(addr: pointer; old_len, new_len: size_t; may_move: longint): pointer; inline;
begin
result := pointer(do_syscall(syscall_nr_mremap, TSysParam(addr),
TSysParam(old_len), TSysParam(new_len), TSysParam(may_move)));
end;
{$endif FPCMM_NOMREMAP}
{$else BSD}
{$define FPCMM_NOMREMAP} // mremap is a Linux-specific syscall
{$endif LINUX}
procedure SwitchToThread; inline;
var
t: Ttimespec;
begin
// note: nanosleep() adds a few dozen of microsecs for context switching
t.tv_sec := 0;
t.tv_nsec := 10; // empirically identified on a recent Linux Kernel
fpnanosleep(@t, nil);
end;
{$endif MSWINDOWS}
{$ifdef FPCMM_DEBUG}
procedure ReleaseCore; nostackframe; assembler;
asm
rdtsc
shl rdx, 32
or rax, rdx
push rax
call SwitchToThread
pop rcx
rdtsc
shl rdx, 32
or rax, rdx
lea rdx, [rip + HeapStatus]
sub rax, rcx
lock xadd qword ptr [rdx + TMMStatus.SleepCycles], rax
lock inc qword ptr [rdx + TMMStatus.SleepCount]
end;
{$else}
procedure ReleaseCore;
begin
SwitchToThread;
inc(HeapStatus.SleepCount); // indicative counter
end;
{$endif FPCMM_DEBUG}
{ ********* Some Assembly Helpers }
procedure NotifyAlloc(var Arena: TMMStatusArena; Size: PtrUInt);
nostackframe; assembler;
asm
mov rax, Size
lock xadd qword ptr [Arena].TMMStatusArena.CurrentBytes, rax
lock xadd qword ptr [Arena].TMMStatusArena.CumulativeBytes, Size
{$ifdef FPCMM_DEBUG}
lock inc qword ptr [Arena].TMMStatusArena.CumulativeAlloc
mov rax, qword ptr [Arena].TMMStatusArena.CurrentBytes
cmp rax, qword ptr [Arena].TMMStatusArena.PeakBytes
jbe @s
mov qword ptr [Arena].TMMStatusArena.PeakBytes, rax
@s: {$endif FPCMM_DEBUG}
end;
procedure NotifyFree(var Arena: TMMStatusArena; Size: PtrUInt);
nostackframe; assembler;
asm
neg Size
lock xadd qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
{$ifdef FPCMM_DEBUG}
lock inc qword ptr [Arena].TMMStatusArena.CumulativeFree
{$endif FPCMM_DEBUG}
end;
// faster than Move() as called from ReallocateLargeBlock
procedure MoveLarge(src, dst: pointer; cnt: PtrInt); nostackframe; assembler;
asm
sub cnt, 8
add src, cnt
add dst, cnt
neg cnt
jns @z
align 16
@s: movaps xmm0, oword ptr [src + cnt] // AVX move is not really faster
movntdq oword ptr [dst + cnt], xmm0 // non-temporal loop
add cnt, 16
js @s
sfence
@z: mov rax, qword ptr [src + cnt]
mov qword ptr [dst + cnt], rax
end;
{ ********* Constants and Data Structures Definitions }
const
{$ifdef FPCMM_BOOST} // someimtes the more arenas, the better multi-threadable
{$ifdef FPCMM_BOOSTER}
NumTinyBlockTypesPO2 = 4;
NumTinyBlockArenasPO2 = 5; // will probably end up with Medium lock contention
{$else}
NumTinyBlockTypesPO2 = 4; // tiny are <= 256 bytes
NumTinyBlockArenasPO2 = 4; // 16 + 1 arenas
{$endif FPCMM_BOOSTER}
{$else}
NumTinyBlockTypesPO2 = 3; // multiple arenas for tiny blocks <= 128 bytes
NumTinyBlockArenasPO2 = 3; // 8 round-robin arenas + 1 main by default
{$endif FPCMM_BOOST}
NumSmallBlockTypes = 46;
MaximumSmallBlockSize = 2608;
SmallBlockSizes: array[0..NumSmallBlockTypes - 1] of word = (
16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256,
272, 288, 304, 320, 352, 384, 416, 448, 480, 528, 576, 624, 672, 736, 800,
880, 960, 1056, 1152, 1264, 1376, 1504, 1648, 1808, 1984, 2176, 2384,
MaximumSmallBlockSize, MaximumSmallBlockSize, MaximumSmallBlockSize);
NumTinyBlockTypes = 1 shl NumTinyBlockTypesPO2;
NumTinyBlockArenas = 1 shl NumTinyBlockArenasPO2;
NumSmallInfoBlock = NumSmallBlockTypes + NumTinyBlockArenas * NumTinyBlockTypes;
SmallBlockGranularity = 16;
TargetSmallBlocksPerPool = 48;
MinimumSmallBlocksPerPool = 12;
SmallBlockDownsizeCheckAdder = 64;
SmallBlockUpsizeAdder = 32;
{$ifdef FPCMM_LOCKLESSFREE}
SmallBlockTypePO2 = 8; // SizeOf(TSmallBlockType)=256
SmallBlockBinCount = (((1 shl SmallBlockTypePO2) - 64) div 8) - 1;
{$else}
SmallBlockTypePO2 = 6;
{$endif FPCMM_LOCKLESSFREE}
MediumBlockPoolSizeMem = 20 * 64 * 1024;
MediumBlockPoolSize = MediumBlockPoolSizeMem - 16;
MediumBlockSizeOffset = 48;
MinimumMediumBlockSize = 11 * 256 + MediumBlockSizeOffset;
MediumBlockBinsPerGroup = 32;
MediumBlockBinGroupCount = 32;
MediumBlockBinCount = MediumBlockBinGroupCount * MediumBlockBinsPerGroup;
MediumBlockGranularity = 256;
MaximumMediumBlockSize =
MinimumMediumBlockSize + (MediumBlockBinCount - 1) * MediumBlockGranularity;
OptimalSmallBlockPoolSizeLowerLimit =
29 * 1024 - MediumBlockGranularity + MediumBlockSizeOffset;
OptimalSmallBlockPoolSizeUpperLimit =
64 * 1024 - MediumBlockGranularity + MediumBlockSizeOffset;
MaximumSmallBlockPoolSize =
OptimalSmallBlockPoolSizeUpperLimit + MinimumMediumBlockSize;
LargeBlockGranularity = 65536;
MediumInPlaceDownsizeLimit = MinimumMediumBlockSize div 4;
IsFreeBlockFlag = 1;
IsMediumBlockFlag = 2;
IsSmallBlockPoolInUseFlag = 4;
IsLargeBlockFlag = 4;
PreviousMediumBlockIsFreeFlag = 8;
LargeBlockIsSegmented = 8;
DropSmallFlagsMask = -8;
ExtractSmallFlagsMask = 7;
DropMediumAndLargeFlagsMask = -16;
ExtractMediumAndLargeFlagsMask = 15;
// use pause before ReleaseCore API call when spinning locks
// pause is 140 cycles since SkylakeX - see http://tiny.cc/010ioz -> use rdtsc
// which has 30 cycles latency; ring3 to ring 0 transition is 1000 cycles
{$ifdef FPCMM_PAUSE}
SpinSmallGetmemLockTSC = 1000;
SpinSmallFreememLockTSC = 1000; // _freemem has more collisions
{$ifdef FPCMM_LOCKLESSFREE}
SpinSmallFreememBinTSC = 2000;
{$endif FPCMM_LOCKLESSFREE}
SpinMediumLockTSC = 2000;
SpinLargeLockTSC = 2000;
{$else}
SpinMediumLockTSC = 1000; // minimum spinning
SpinLargeLockTSC = 1000;
{$endif FPCMM_PAUSE}
type
PSmallBlockPoolHeader = ^TSmallBlockPoolHeader;
// information for each small block size - 64/256 bytes long >= CPU cache line
TSmallBlockType = record
BlockTypeLocked: boolean;
AllowedGroupsForBlockPoolBitmap: Byte;
BlockSize: Word;
MinimumBlockPoolSize: Word;
OptimalBlockPoolSize: Word;
NextPartiallyFreePool: PSmallBlockPoolHeader;
PreviousPartiallyFreePool: PSmallBlockPoolHeader;
NextSequentialFeedBlockAddress: pointer;
MaxSequentialFeedBlockAddress: pointer;
CurrentSequentialFeedPool: PSmallBlockPoolHeader;
GetmemCount: cardinal;
FreememCount: cardinal;
GetmemSleepCount: cardinal;
FreememSleepCount: cardinal;
{$ifdef FPCMM_LOCKLESSFREE} // 192 optional bytes for FreeMem Bin
BinLocked: boolean;
BinCount: byte;
BinSpinCount: cardinal;
BinInstance: array[0.. SmallBlockBinCount - 1] of pointer;
{$endif FPCMM_LOCKLESSFREE}
end;
PSmallBlockType = ^TSmallBlockType;
TSmallBlockTypes = array[0..NumSmallBlockTypes - 1] of TSmallBlockType;
TTinyBlockTypes = array[0..NumTinyBlockTypes - 1] of TSmallBlockType;
TSmallBlockInfo = record
Small: TSmallBlockTypes;
Tiny: array[0..NumTinyBlockArenas - 1] of TTinyBlockTypes;
GetmemLookup: array[0..
(MaximumSmallBlockSize - 1) div SmallBlockGranularity] of byte;
{$ifndef FPCMM_ASSUMEMULTITHREAD}
IsMultiThreadPtr: PBoolean; // safe access to IsMultiThread global variable
{$endif FPCMM_ASSUMEMULTITHREAD}
TinyCurrentArena: integer;
end;
TSmallBlockPoolHeader = record
BlockType: PSmallBlockType;
{$ifdef CPU32}
Padding32Bits: cardinal;
{$endif}
NextPartiallyFreePool: PSmallBlockPoolHeader;
PreviousPartiallyFreePool: PSmallBlockPoolHeader;
FirstFreeBlock: pointer;
BlocksInUse: Cardinal;
SmallBlockPoolSignature: Cardinal;
FirstBlockPoolPointerAndFlags: PtrUInt;
end;
PMediumBlockPoolHeader = ^TMediumBlockPoolHeader;
TMediumBlockPoolHeader = record
PreviousMediumBlockPoolHeader: PMediumBlockPoolHeader;
NextMediumBlockPoolHeader: PMediumBlockPoolHeader;
Reserved1: PtrUInt;
FirstMediumBlockSizeAndFlags: PtrUInt;
end;
PMediumFreeBlock = ^TMediumFreeBlock;
TMediumFreeBlock = record
PreviousFreeBlock: PMediumFreeBlock;
NextFreeBlock: PMediumFreeBlock;
end;
TMediumBlockInfo = record
Locked: boolean;
PoolsCircularList: TMediumBlockPoolHeader;
LastSequentiallyFed: pointer;
SequentialFeedBytesLeft: Cardinal;
BinGroupBitmap: Cardinal;
{$ifndef FPCMM_ASSUMEMULTITHREAD}
IsMultiThreadPtr: PBoolean; // safe access to IsMultiThread global variable
{$endif FPCMM_ASSUMEMULTITHREAD}
BinBitmaps: array[0..MediumBlockBinGroupCount - 1] of Cardinal;
Bins: array[0..MediumBlockBinCount - 1] of TMediumFreeBlock;
end;
PLargeBlockHeader = ^TLargeBlockHeader;
TLargeBlockHeader = record
PreviousLargeBlockHeader: PLargeBlockHeader;
NextLargeBlockHeader: PLargeBlockHeader;
Reserved1: PtrUInt;
BlockSizeAndFlags: PtrUInt;
end;
const
BlockHeaderSize = SizeOf(pointer);
SmallBlockPoolHeaderSize = SizeOf(TSmallBlockPoolHeader);
MediumBlockPoolHeaderSize = SizeOf(TMediumBlockPoolHeader);
LargeBlockHeaderSize = SizeOf(TLargeBlockHeader);
var
SmallBlockInfo: TSmallBlockInfo;
MediumBlockInfo: TMediumBlockInfo;
LargeBlocksLocked: boolean;
LargeBlocksCircularList: TLargeBlockHeader;
{ ********* Shared Routines }
procedure LockMediumBlocks; nostackframe; assembler;
asm
// on input/output: r10=MediumBlockInfo
@s: rdtsc // tsc in edx:eax
shl rdx, 32
lea r9, [rax + rdx + SpinMediumLockTSC] // r9 = endtsc
@sp: pause
rdtsc
shl rdx, 32
or rax, rdx
cmp rax, r9
ja @rc // timeout
mov rcx, r10
mov eax, $100
cmp byte ptr [r10].TMediumBlockInfo.Locked, true
je @sp
lock cmpxchg byte ptr [rcx].TMediumBlockInfo.Locked, ah
je @ok
jmp @sp
@rc: push rsi // preserve POSIX ABI registers
push rdi
push r10
push r11
call ReleaseCore
pop r11
pop r10
pop rdi
pop rsi
lea rax, [rip + HeapStatus]
lock inc qword ptr [rax].TMMStatus.Medium.SleepCount
jmp @s
@ok:
end;
procedure InsertMediumBlockIntoBin; nostackframe; assembler;
asm
// rcx=MediumFreeBlock edx=MediumBlockSize r10=MediumBlockInfo - even on POSIX
mov rax, rcx
// Get the bin number for this block size
sub edx, MinimumMediumBlockSize
shr edx, 8
// Validate the bin number
sub edx, MediumBlockBinCount - 1
sbb ecx, ecx
and edx, ecx
add edx, MediumBlockBinCount - 1
mov r9, rdx
// Get the bin address in rcx
shl edx, 4
lea rcx, [r10 + rdx + TMediumBlockInfo.Bins]
// Bins are LIFO, se we insert this block as the first free block in the bin
mov rdx, TMediumFreeBlock[rcx].NextFreeBlock
mov TMediumFreeBlock[rax].PreviousFreeBlock, rcx
mov TMediumFreeBlock[rax].NextFreeBlock, rdx
mov TMediumFreeBlock[rdx].PreviousFreeBlock, rax
mov TMediumFreeBlock[rcx].NextFreeBlock, rax
// Was this bin empty?
cmp rdx, rcx
jne @Done
// Get ecx=bin number, edx=group number
mov rcx, r9
mov rdx, r9
shr edx, 5
// Flag this bin as not empty
mov eax, 1
shl eax, cl
or dword ptr [r10 + TMediumBlockInfo.BinBitmaps + rdx * 4], eax
// Flag the group as not empty
mov eax, 1
mov ecx, edx
shl eax, cl
or [r10 + TMediumBlockInfo.BinGroupBitmap], eax
@Done:
end;
procedure RemoveMediumFreeBlock; nostackframe; assembler;
asm
// rcx=MediumFreeBlock r10=MediumBlockInfo - even on POSIX
// Get the current previous and next blocks
mov rdx, TMediumFreeBlock[rcx].PreviousFreeBlock
mov rcx, TMediumFreeBlock[rcx].NextFreeBlock
// Remove this block from the linked list
mov TMediumFreeBlock[rcx].PreviousFreeBlock, rdx
mov TMediumFreeBlock[rdx].NextFreeBlock, rcx
// Is this bin now empty? If the previous and next free block pointers are
// equal, they must point to the bin
cmp rcx, rdx
jne @Done
// Get ecx=bin number, edx=group number
lea r8, [r10 + TMediumBlockInfo.Bins]
sub rcx, r8
mov edx, ecx
shr ecx, 4
shr edx, 9
// Flag this bin as empty
mov eax, -2
rol eax, cl
and dword ptr [r10 + TMediumBlockInfo.BinBitmaps + rdx * 4], eax
jnz @Done
// Flag this group as empty
mov eax, -2
mov ecx, edx
rol eax, cl
and [r10 + TMediumBlockInfo.BinGroupBitmap], eax
@Done:
end;
procedure BinMediumSequentialFeedRemainder; nostackframe; assembler;
asm
// r10=MediumBlockInfo - even on POSIX
mov eax, [r10 + TMediumBlockInfo.SequentialFeedBytesLeft]
test eax, eax
jz @Done
// Is the last fed sequentially block free?
mov rax, [r10 + TMediumBlockInfo.LastSequentiallyFed]
test byte ptr [rax - BlockHeaderSize], IsFreeBlockFlag
jnz @LastBlockFedIsFree
// Set the "previous block is free" flag in the last block fed
or qword ptr [rax - BlockHeaderSize], PreviousMediumBlockIsFreeFlag
// Get edx=remainder size, rax=remainder start
mov edx, [r10 + TMediumBlockInfo.SequentialFeedBytesLeft]
sub rax, rdx
@BinTheRemainder:
// Store the size of the block as well as the flags
lea rcx, [rdx + IsMediumBlockFlag + IsFreeBlockFlag]
mov [rax - BlockHeaderSize], rcx
// Store the trailing size marker
mov [rax + rdx - 16], rdx
// Bin this medium block
cmp edx, MinimumMediumBlockSize
jb @Done
mov rcx, rax
call InsertMediumBlockIntoBin // rcx=APMediumFreeBlock, edx=AMediumBlockSize
@Done: ret
@LastBlockFedIsFree:
// Drop the flags
mov rdx, DropMediumAndLargeFlagsMask
and rdx, [rax - BlockHeaderSize]
// Free the last block fed
cmp edx, MinimumMediumBlockSize
jb @DontRemoveLastFed
// Last fed block is free - remove it from its size bin
mov rcx, rax
call RemoveMediumFreeBlock // rcx = APMediumFreeBlock
// Re-read rax and rdx
mov rax, [r10 + TMediumBlockInfo.LastSequentiallyFed]
mov rdx, DropMediumAndLargeFlagsMask
and rdx, [rax - BlockHeaderSize]
@DontRemoveLastFed:
// Get the number of bytes left in ecx
mov ecx, [r10 + TMediumBlockInfo.SequentialFeedBytesLeft]
// rax = remainder start, rdx = remainder size
sub rax, rcx
add edx, ecx
jmp @BinTheRemainder
end;
procedure FreeMedium(ptr: PMediumBlockPoolHeader);
begin
Free(ptr, MediumBlockPoolSizeMem);
NotifyFree(HeapStatus.Medium, MediumBlockPoolSizeMem);
end;
function AllocNewSequentialFeedMediumPool(blocksize: Cardinal): pointer;
var
old: PMediumBlockPoolHeader;
new: pointer;
begin
BinMediumSequentialFeedRemainder;
new := AllocMedium(MediumBlockPoolSizeMem);
with MediumblockInfo do
if new <> nil then
begin
old := PoolsCircularList.NextMediumBlockPoolHeader;
PMediumBlockPoolHeader(new).PreviousMediumBlockPoolHeader := @PoolsCircularList;
PoolsCircularList.NextMediumBlockPoolHeader := new;
PMediumBlockPoolHeader(new).NextMediumBlockPoolHeader := old;
old.PreviousMediumBlockPoolHeader := new;
PPtrUInt(PByte(new) + MediumBlockPoolSize - BlockHeaderSize)^ := IsMediumBlockFlag;
SequentialFeedBytesLeft :=
(MediumBlockPoolSize - MediumBlockPoolHeaderSize) - blocksize;
result := pointer(PByte(new) + MediumBlockPoolSize - blocksize);
LastSequentiallyFed := result;
PPtrUInt(PByte(result) - BlockHeaderSize)^ := blocksize or IsMediumBlockFlag;
NotifyAlloc(HeapStatus.Medium, MediumBlockPoolSizeMem);
end
else
begin
SequentialFeedBytesLeft := 0;
result := nil;
end;
end;
procedure LockLargeBlocks; nostackframe; assembler;
asm
@s: mov eax, $100
lea rcx, [rip + LargeBlocksLocked]
lock cmpxchg byte ptr [rcx], ah
je @ok
rdtsc
shl rdx, 32
lea r9, [rax + rdx + SpinLargeLockTSC] // r9 = endtsc
@sp: pause
rdtsc
shl rdx, 32
or rax, rdx
cmp rax, r9
ja @rc // timeout
mov eax, $100
cmp byte ptr [rcx], ah // don't flush the CPU cache if Locked still true
je @sp
lock cmpxchg byte ptr [rcx], ah
je @ok
jmp @sp
@rc: call ReleaseCore
lea rax, [rip + HeapStatus]
lock inc qword ptr [rax].TMMStatus.Large.SleepCount
jmp @s
@ok:
end;
function AllocateLargeBlockFrom(size: PtrUInt;
existing: pointer; oldsize: PtrUInt): pointer;
var
blocksize: PtrUInt;
header, old: PLargeBlockHeader;
begin
blocksize := (size + LargeBlockHeaderSize +
LargeBlockGranularity - 1 + BlockHeaderSize) and -LargeBlockGranularity;
if existing = nil then
header := AllocLarge(blocksize)
else
{$ifdef FPCMM_NOMREMAP}
header := nil; // paranoid
{$else}
header := fpmremap(existing, oldsize, blocksize, MREMAP_MAYMOVE);
{$endif FPCMM_NOMREMAP}
if header <> nil then
begin
NotifyAlloc(HeapStatus.Large, blocksize);
if existing <> nil then
NotifyFree(HeapStatus.Large, oldsize);
header.BlockSizeAndFlags := blocksize or IsLargeBlockFlag;
LockLargeBlocks;
old := LargeBlocksCircularList.NextLargeBlockHeader;
header.PreviousLargeBlockHeader := @LargeBlocksCircularList;
LargeBlocksCircularList.NextLargeBlockHeader := header;
header.NextLargeBlockHeader := old;
old.PreviousLargeBlockHeader := header;
LargeBlocksLocked := False;
inc(header);
end;
result := header;
end;
function AllocateLargeBlock(size: PtrUInt): pointer;
begin
result := AllocateLargeBlockFrom(size, nil, 0);
end;
procedure FreeLarge(ptr: PLargeBlockHeader; size: PtrUInt);
begin
NotifyFree(HeapStatus.Large, size);
Free(ptr, size);
end;
function FreeLargeBlock(p: pointer): PtrInt;
var
header, prev, next: PLargeBlockHeader;