-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy paththread.c
985 lines (962 loc) · 49.6 KB
/
thread.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
#include "chess.h"
#include "data.h"
#include "epdglue.h"
#if (CPUS > 1)
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* Split() is the driver for the threaded parallel search in Crafty. The *
* basic idea is that whenever we notice that one (or more) threads are in *
* their idle loop, we drop into Split(), from Search(), and begin a new *
* parallel search from this node. This is simply a problem of establishing *
* a new split point, and then letting each thread join this split point and *
* copy whatever data they need. *
* *
* This is generation II of Split(). The primary differences address two *
* specific performance-robbing issues. (1) Excessive waiting for a split *
* to be done, and (b) excessive waiting on specific locks. Generation II *
* addresses both of these to significantly improve performance. *
* *
* The main difference between Gen I and Gen II is the effort required to *
* split the tree and which thread(s) expend this effort. In generation I, *
* the parent thread was responsible for allocating a split block for each *
* child thread, and then copying the necessary data from the parent split *
* block to these child split blocks. When all of this was completed, the *
* child processes were released to start the parallel search after being *
* held while the split / copy operations were done. In the generation II *
* Split() we now simply allocate a new split block for THIS thread, flag *
* the parent split block as joinable, and then go directly to ThreadWait() *
* which will drop us back in to the search immediately. The idle threads *
* are continually looping on Join() which will jump them right into this *
* split block letting them do ALL the work of allocating a split block, *
* filling it in, and then copying the data to their local split block. *
* This distributes the split overhead among all the threads that split, *
* rather than this thread having to do all the work while the other threads *
* sit idle. *
* *
* Generation II is also much more lightweight, in that it copies only the *
* bare minimum from parent to child. Generation I safely copied far too *
* much since this code was being changed regularly, but that is no longer *
* necessary overhead. *
* *
* Generation II has a zero contention split algorithm. In the past, when a *
* thread became idle, it posted a global split request and any thread that *
* was at an appropriate point would try to split. But while it was doing *
* the splitting, other threads that were also willing to split would "get *
* in line" because Crafty used a global lock to prevent two threads from *
* attempting to split at the same instant in time. They got in line, and *
* waited for the original splitter to release the lock, but now they have *
* no idle threads to split with. A waste of time. Now we allow ANY thread *
* to attempt to split at the current ply. When we do what might be called *
* a "gratuitous split" the only restriction is that if we have existing *
* "gratuitous split points" (split blocks that are joinable but have not *
* yet been joined), then we limit the number of such splits (per thread) to *
* avoid excessive overhead. *
* *
* Generation II takes another idea from DTS, the idea of "late-join". The *
* idea is fairly simple. If, when a thread becomes idle, there are already *
* other split points being searched in parallel, then we will try to join *
* one of them rather than waiting for someone to ask us to help. We use *
* some simple criteria: (1) The split point must be joinable, which simply *
* means that no processor has exited the split point yet (which would let *
* us know there is no more work here and a join would be futile); (2) We *
* compute an "interest" value which is a simple formula based on depth at *
* the split point, and the number of moves already searched. It seems less *
* risky to choose a split point with max depth AND minimum moves already *
* searched so that there is plenty to do. This was quite simple to add *
* after the rest of the Generation II rewrite. In fact, this is now THE *
* way threads join a split point, period, which further simplifies the code *
* and improves efficiency. IE, a thread can split when idle threads are *
* noticed, or if it is far enough away from the tips to make the cost *
* negligible. At that point any idle thread(s) can join immediately, those *
* that become idle later can join when they are ready. *
* *
* There are a number of settable options via the command-line or .craftyrc *
* initialization file. Here's a concise explanation for each option and an *
* occasional suggestion for testing/tuning. *
* *
* smp_affinity (command = smpaffinity=<n> is used to enable or disable *
* processor affinity. -1 disables affinity and lets threads run on any *
* available core. If you use an integer <n> then thread zero will bind *
* itself to cpu <n> and each additional thread will bind to the next *
* higher cpu number. This is useful if you try to run two copies of *
* crafty on the same machine, now you can cause one to bind to the first *
* <n> cores, and the second to the last <n> cores. For the first *
* instance of Crafty, you would use smpaffinity=0, and for the second *
* smpaffinity=8, assuming you are running 8 threads per copy on a 16 cpu *
* machine. If you get this wrong, you can have more than one thread on *
* the same cpu which will significantly impact performance. *
* *
* smp_max_threads (command = smpmt=n) sets the total number of allowable *
* threads for the search. The default is one (1) as Crafty does not *
* assume it should use all available resources. For optimal performance *
* this should be set to the number of physical cores your machine has, *
* which does NOT include hyperthreaded cores. *
* *
* smp_split_group (command = smpgroup=n) sets the maximum number of threads *
* at any single split point, with the exception of split points fairly *
* close to the root where ALL threads are allowed to split together, *
* ignoring this limit. Note that this is ignored in the first 1/2 of *
* the tree (the nodes closer to the root). There it is actually good to *
* split and get all active threads involved. *
* *
* smp_min_split_depth (command = smpmin=n) avoids splitting when remaining *
* depth < n. This is used to balance splitting overhead cost against *
* the speed gain the parallel search produces. The default is currently *
* 5 (which could change with future generations of Intel hardware) but *
* values between 4 and 8 will work. Larger values allow somewhat fewer *
* splits, which reduces overhead, but it also increases the percentage *
* of the time where a thread is waiting on work. *
* *
* smp_split_at_root (command = smproot=0 or 1) enables (1) or disables (0) *
* splitting the tree at the root. This defaults to 1 which produces the *
* best performance by a signficiant margin. But it can be disabled if *
* you are playing with code changes. *
* *
* smp_gratuitous_depth (command = smpgd=<n>) controls " gratuitous splits" *
* which are splits that are done without any idle threads. This sets a *
* depth limit (remaining depth) that must be present before such a split *
* can be done. Making this number larger will reduce the number of *
* these splits. Making it too small will increase overhead slightly and *
* increase split block usage significantly. *
* *
* smp_gratuitous_limit (command = smpgl=<n>) limits the number of these *
* splits that a thread can do. Once a thread has this number of *
* unjoined split points, it will not be allowed to split any more until *
* one or more threads join at least one of the existing split points. *
* In the smp search statistics, where you see output that looks like *
* this: *
* *
* splits=m(n) ... *
* *
* m is the total splits done, n is the number of "wasted splits" which *
* are basically gratuitous splits where no thread joined before this *
* split point was completed and deallocated. *
* *
* The best way to tune all of these paramenters is to use the "autotune" *
* command (see autotune.c and help autotune) which will automatically run *
* tests and optimize the parameters. More details are in the autotune.c *
* source file. *
* *
* A few basic "rules of the road" for anyone interested in changing or *
* adding to any of this code. *
* *
* 1. If, at any time, you want to modify your private split block, no lock *
* is required. *
* *
* 2. If, at any time, you want to modify another split block, such as the *
* parent split block shared move list, you must acquire the lock in the *
* split block first. IE tree->parent->lock to lock the parent split *
* block during NextMove() and such. *
* *
* 3. If you want to modify any SMP-related data that spans multiple split *
* blocks, such as telling sibling processes to stop, etc, then you must *
* acquire the global "lock_smp" lock first. This prevents a deadlock *
* caused by two different threads walking the split block chain from *
* different directions, and acquiring the split block locks in *
* different orders, which could cause a catastrophic deadlock to occur. *
* This is an infrequent event so the overhead is not significant. *
* *
* 4. If you want to do any sort of I/O operation, you must acquire the *
* "lock_io" lock first. Since threads share descriptors, there are *
* lots of potential race conditions, from the simple tty-output getting *
* interlaced from different threads, to outright data corruption in the *
* book or log files. *
* *
* Some of the bugs caused by failing to acquire the correct lock will only *
* occur infrequently, and they are extremely difficult to find. Some only *
* show up in a public game where everyone is watching, to cause maximum *
* embarassment and causes the program to do something extremely stupid. *
* *
*******************************************************************************
*/
int Split(TREE * RESTRICT tree) {
TREE *child;
int tid, tstart, tend;
/*
************************************************************
* *
* Here we prepare to split the tree. All we really do in *
* the Generation II threading is grab a split block for *
* this thread, then flag the parent as "joinable" and *
* then jump right to ThreadWait() to resume where we left *
* off, with the expectation (but not a requirement) that *
* other threads will join us to help. *
* *
* Idle threads are sitting in ThreadWait() repeatedly *
* calling Join() to find them a split point, which we are *
* fixing to provide. They will then join as quickly as *
* they can, and other threads that become idle later can *
* also join without any further splitting needed. *
* *
* If we are unable to allocate a split block, we simply *
* abort this attempted split and return to the search *
* since other threads will also split quickly. *
* *
************************************************************
*/
tstart = ReadClock();
tree->nprocs = 0;
for (tid = 0; tid < smp_max_threads; tid++)
tree->siblings[tid] = 0;
child = GetBlock(tree, tree->thread_id);
if (!child)
return 0;
CopyFromParent(child);
thread[tree->thread_id].tree = child;
tree->joined = 0;
tree->joinable = 1;
parallel_splits++;
smp_split = 0;
tend = ReadClock();
thread[tree->thread_id].idle += tend - tstart;
/*
************************************************************
* *
* We have successfully created a split point, which means *
* we are done. The instant we set the "joinable" flag, *
* idle threads may begin to join in at this split point *
* to help. Since this thread may finish before any or *
* all of the other parallel threads, this thread is sent *
* to ThreadWait() which will immediately send it to *
* SearchMoveList() like the other threads; however, going *
* to ThreadWait() allows this thread to join others if it *
* runs out of work to do. We do pass ThreadWait() the *
* address of the parent split block, so that if this *
* thread becomes idle, and this thread block shows no *
* threads are still busy, then this thread can return to *
* here and then back up into the previous ply as it *
* should. Note that no other thread can back up to the *
* previous ply since their recursive call stacks are not *
* set for that, while this call stack will bring us back *
* to this point where we return to the normal search, *
* which we just completed. *
* *
************************************************************
*/
ThreadWait(tree->thread_id, tree);
if (!tree->joined)
parallel_splits_wasted++;
return 1;
}
/* modified 12/16/15 */
/*
*******************************************************************************
* *
* Join() is called just when we enter the usual spin-loop waiting for work. *
* We take a quick look at all active split blocks to see if any look *
* "joinable". If so, we compute an "interest" value, which will be defined *
* below. We then join the most interesting split point directly. This *
* split point might have been created specifically for this thread to join, *
* or it might be one that was already active when this thread became idle, *
* which allows us to join that existing split point and not request a new *
* split operation, saving time. *
* *
*******************************************************************************
*/
int Join(int64_t tid) {
TREE *tree, *join_block, *child;
int interest, best_interest, current, pass = 0;
/*
************************************************************
* *
* First we pass over ALL split blocks, looking for those *
* flagged as "joinable" (which means they are actually *
* active split points and that no processor at that split *
* point has run out of work (there is no point in joining *
* a split point with no remaining work) and no fail high *
* has been found which would raise the "stop" flag.) This *
* is "racy" because we do not acquire any locks, which *
* means that the busy threads continue working, and there *
* is a small probability that the split point will *
* disappear while we are in this loop. To resolve the *
* potential race, after we find the most attractive split *
* point, we acquire the lock for that split block and *
* test again, but this time if the block is joinable, we *
* can safely join under control of the lock, which is not *
* held for very long at all. If the block is not *
* joinable once we acquire the lock, we abort joining *
* since it is futile. Note that if this happens, we will *
* try to find an existing split point we can join three *
* times before we exit, setting split to 1 to ask other *
* threads to produce more candidate split points. *
* *
************************************************************
*/
for (pass = 0; pass < 3; pass++) {
best_interest = -999999;
join_block = 0;
for (current = 0; current <= smp_max_threads * 64; current++) {
tree = block[current];
if (tree->joinable && (tree->ply <= tree->depth / 2 ||
tree->nprocs < smp_split_group)) {
interest = tree->depth * 2 - tree->searched[0];
if (interest > best_interest) {
best_interest = interest;
join_block = tree;
}
}
}
/*
************************************************************
* *
* Now we acquire the lock for this split block, and then *
* check to see if the block is still flagged as joinable. *
* If so, we set things up, and then we get pretty tricky *
* as we then release the lock, and then copy the data *
* from the parent to our split block. There is a chance *
* that while we are copying this data, the split point *
* gets completed by other threads. Which would leave an *
* apparent race condition exposed where we start copying *
* data here, the split point is completed, the parent *
* block is released and then reacquired and we continue *
* if nothing has happened here, getting data copied from *
* two different positions. *
* *
* Fortunately, we linked this new split block to the old *
* (original parent). If that split block is released, we *
* will discover this because that operation will also set *
* our "stop" flag which will prevent us from using this *
* data and breaking things. We allow threads to copy *
* this data without any lock protection to eliminate a *
* serialization (each node would copy the data serially, *
* rather than all at once) with the only consequence to *
* this being the overhead of copying and then throwing *
* the data away, which can happen on occasion even if we *
* used a lock for protection, since once we release the *
* lock it still takes time to get into the search and we *
* could STILL find that this split block has already been *
* completed, once again. Less contention and serial *
* computing improves performance. *
* *
************************************************************
*/
if (join_block) {
Lock(join_block->lock);
if (join_block->joinable) {
child = GetBlock(join_block, tid);
Unlock(join_block->lock);
if (child) {
CopyFromParent(child);
thread[tid].tree = child;
parallel_joins++;
return 1;
}
} else {
Unlock(join_block->lock);
break;
}
}
}
/*
************************************************************
* *
* We did not acquire a split point to join, so we set *
* smp_split to 1 to ask busy threads to create joinable *
* split points. *
* *
************************************************************
*/
smp_split = 1;
return 0;
}
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* ThreadInit() is called after a process is created. Its main task is to *
* initialize the process local memory so that it will fault in and be *
* allocated on the local node rather than the node where the original *
* (first) process was running. All threads will hang here via a custom *
* WaitForALlThreadsInitialized() procedure so that all the local thread *
* blocks are usable before the search actually begins. *
* *
*******************************************************************************
*/
void *STDCALL ThreadInit(void *t) {
int tid = (int64_t) t;
ThreadAffinity(tid);
# if !defined(UNIX)
ThreadMalloc((uint64_t) tid);
# endif
thread[tid].blocks = 0xffffffffffffffffull;
Lock(lock_smp);
initialized_threads++;
Unlock(lock_smp);
WaitForAllThreadsInitialized();
ThreadWait(tid, (TREE *) 0);
Lock(lock_smp);
smp_threads--;
Unlock(lock_smp);
return 0;
}
/* modified 01/08/15 */
/*
*******************************************************************************
* *
* ThreadAffinity() is called to "pin" a thread to a specific processor. It *
* is a "noop" (no-operation) if Crafty was not compiled with -DAFFINITY, or *
* if smp_affinity is negative (smpaffinity=-1 disables affinity). It *
* simply sets the affinity for the current thread to the requested CPU and *
* returns. NOTE: If hyperthreading is enabled, there is no guarantee that *
* this will work as expected and pin one thread per physical core. It *
* depends on how the O/S numbers the SMT cores. *
* *
*******************************************************************************
*/
void ThreadAffinity(int cpu) {
# if defined(AFFINITY)
cpu_set_t cpuset;
pthread_t current_thread = pthread_self();
if (smp_affinity >= 0) {
CPU_ZERO(&cpuset);
CPU_SET(cpu + smp_affinity, &cpuset);
pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
}
# endif
}
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* ThreadSplit() is used to determine if we should split at the current ply. *
* There are some basic constraints on when splits can be done, such as the *
* depth remaining in the search (don't split to near the tips), and have we *
* searched at least one move to get a score or bound (YBW condition). *
* *
* If those conditions are satisfied, AND either a thread has requested a *
* split OR we are far enough away from the tips of the tree to justify a *
* "gratuitout split" then we return "success." A "gratuitout split" is a *
* split done without any idle threads. Since splits are not free, we only *
* do this well away from tips to limit overhead. We do this so that when a *
* thread becomes idle, it will find these split points immediately and not *
* have to wait for a split after the fact. *
* *
*******************************************************************************
*/
int ThreadSplit(TREE * tree, int ply, int depth, int alpha, int o_alpha,
int done) {
TREE *used;
int64_t tblocks;
int temp, unused = 0;
/*
************************************************************
* *
* First, we see if we meet the basic criteria to create a *
* split point, that being that we must not be too far *
* from the root (smp_min_split_depth). *
* *
************************************************************
*/
if (depth < smp_min_split_depth)
return 0;
/*
************************************************************
* *
* If smp_split is NOT set, we are checking to see if it *
* is acceptable to do a gratuitous split here. *
* *
* (1) if we are too far from the root we do not do *
* gratuitous splits to avoid the overhead. *
* *
* (2) if we have searched more than one move at this ply, *
* we don't do any further tests to see if a *
* gratuitous split is acceptable, since we have *
* previously done this test at this ply and decided *
* one should not be done. That condition has likely *
* not changed. *
* *
* (3) if we have pre-existing gratuitous split points for *
* this thread, we make certain we don't create more *
* than the gratuitous split limit as excessive splits *
* just add to the overhead with no benefit. *
* *
************************************************************
*/
if (!smp_split) {
if (depth < smp_gratuitous_depth || done > 1)
return 0;
tblocks = ~thread[tree->thread_id].blocks;
while (tblocks) {
temp = LSB(tblocks);
used = block[temp + tree->thread_id * 64 + 1];
if (used->joinable && !used->joined)
unused++;
Clear(temp, tblocks);
}
if (unused > smp_gratuitous_limit)
return 0;
}
/*
************************************************************
* *
* If smp_split IS set, we are checking to see if it is *
* acceptable to do a split because there are idle threads *
* that need work to do. *
* *
* The only reason this would be false is if we have a *
* pre-existing split point that is joinable but has not *
* been joined. If one exists, there is no need to split *
* again as there is already an accessible split point. *
* Otherwise, if we are at the root and we are either not *
* allowed to split at the root, or we have additional *
* root moves that have to be searched one at a time using *
* all available threads we also can not split here. *
* *
************************************************************
*/
else {
if (ply == 1 && (!smp_split_at_root || !NextRootMoveParallel() ||
alpha == o_alpha))
return 0;
tblocks = ~thread[tree->thread_id].blocks;
while (tblocks) {
temp = LSB(tblocks);
used = block[temp + tree->thread_id * 64 + 1];
if (used->joinable && !used->joined)
unused++;
Clear(temp, tblocks);
}
if (unused > smp_gratuitous_limit)
return 0;
}
return 1;
}
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* ThreadStop() is called from SearchMoveList() when it detects a beta *
* cutoff (fail high) at a node that is being searched in parallel. We need *
* to stop all threads here, and since this threading algorithm is recursive *
* it may be necessary to stop other threads that are helping search this *
* branch further down into the tree. This function simply sets appropriate *
* tree->stop variables to 1, which will stop those particular threads *
* instantly and return them to the idle loop in ThreadWait(). *
* *
*******************************************************************************
*/
void ThreadStop(TREE * RESTRICT tree) {
int proc;
Lock(tree->lock);
tree->stop = 1;
tree->joinable = 0;
for (proc = 0; proc < smp_max_threads; proc++)
if (tree->siblings[proc])
ThreadStop(tree->siblings[proc]);
Unlock(tree->lock);
}
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* ThreadTrace() is a debugging tool that simply walks the split block tree *
* and displays interesting data to help debug the parallel search whenever *
* changes break things. *
* *
*******************************************************************************
*/
void ThreadTrace(TREE * RESTRICT tree, int depth, int brief) {
int proc, i;
Lock(tree->lock);
Lock(lock_io);
if (!brief) {
for (i = 0; i < 4 * depth; i++)
Print(4095, " ");
depth++;
Print(4095, "block[%d] thread=%d ply=%d nprocs=%d ",
FindBlockID(tree), tree->thread_id, tree->ply, tree->nprocs);
Print(4095, "joined=%d joinable=%d stop=%d nodes=%d", tree->joined,
tree->joinable, tree->stop, tree->nodes_searched);
Print(4095, " parent=%d\n", FindBlockID(tree->parent));
} else {
if (tree->nprocs > 1) {
for (i = 0; i < 4 * depth; i++)
Print(4095, " ");
depth++;
Print(4095, "(ply %d)", tree->ply);
}
}
if (tree->nprocs) {
if (!brief) {
for (i = 0; i < 4 * depth; i++)
Print(4095, " ");
Print(4095, " parent=%d sibling threads=",
FindBlockID(tree->parent));
for (proc = 0; proc < smp_max_threads; proc++)
if (tree->siblings[proc])
Print(4095, " %d(%d)", proc, FindBlockID(tree->siblings[proc]));
Print(4095, "\n");
} else {
if (tree->nprocs > 1) {
Print(4095, " helping= ");
for (proc = 0; proc < smp_max_threads; proc++)
if (tree->siblings[proc]) {
if (proc == tree->thread_id)
Print(4095, "[");
Print(4095, "%d", proc);
if (proc == tree->thread_id)
Print(4095, "]");
Print(4095, " ");
}
Print(4095, "\n");
}
}
}
Unlock(lock_io);
for (proc = 0; proc < smp_max_threads; proc++)
if (tree->siblings[proc])
ThreadTrace(tree->siblings[proc], depth, brief);
Unlock(tree->lock);
}
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* ThreadWait() is the idle loop for the N threads that are created at the *
* beginning when Crafty searches. Threads are "parked" here waiting on a *
* pointer to something they should search (a parameter block built in the *
* function Split() in this case. When this pointer becomes non-zero, each *
* thread "parked" here will immediately call SearchMoveList() and begin the *
* parallel search as directed. *
* *
* Upon entry, all threads except for the "master" will arrive here with a *
* value of zero (0) in the waiting parameter below. This indicates that *
* they will search and them be done. The "master" will arrive here with a *
* pointer to the parent split block in "waiting" which says I will sit here *
* waiting for work OR when the waiting split block has no threads working *
* on it, at which point I should return which will let me "unsplit" here *
* and clean things up. The call to here in Split() passes this block *
* address while threads that are helping get here with a zero. *
* *
*******************************************************************************
*/
int ThreadWait(int tid, TREE * RESTRICT waiting) {
int value, tstart, tend;
/*
************************************************************
* *
* When we reach this point, one of three possible *
* conditions is true (1) we already have work to do, as *
* we are the "master thread" and we have already split *
* the tree, we are coming here to join in; (2) we are *
* the master, and we are waiting on our split point to *
* complete, so we come here to join and help currently *
* active threads; (3) we have no work to do, so we will *
* spin until Join() locates a split pont we can join to *
* help out. *
* *
* Note that when we get here, the parent already has a *
* split block and does not need to call Join(), it simply *
* falls through the while spin loop below because its *
* "tree" pointer is already non-zero. *
* *
************************************************************
*/
while (1) {
tstart = ReadClock();
while (!thread[tid].tree && (!waiting || waiting->nprocs) && !Join(tid) &&
!thread[tid].terminate)
Pause();
tend = ReadClock();
if (!thread[tid].tree)
thread[tid].tree = waiting;
thread[tid].idle += tend - tstart;
if (thread[tid].tree == waiting || thread[tid].terminate)
return 0;
/*
************************************************************
* *
* Once we get here, we have a good split point, so we are *
* ready to participate in a parallel search. Once we *
* return from SearchMoveList() we copy our results back *
* to the parent via CopyToParent() before we look for a *
* new split point. If we are a parent, we will slip out *
* of the spin loop at the top and return to the normal *
* serial search to finish up here. *
* *
* When we return from SearchMoveList(), we need to *
* decrement the "nprocs" value since there is now one *
* less thread working at this split point. *
* *
* Note: CopyToParent() marks the current split block as *
* unused once the copy is completed, so we don't have to *
* do anything about that here. *
* *
************************************************************
*/
value =
SearchMoveList(thread[tid].tree, thread[tid].tree->ply,
thread[tid].tree->depth, thread[tid].tree->wtm,
thread[tid].tree->alpha, thread[tid].tree->beta,
thread[tid].tree->searched, thread[tid].tree->in_check, 0, parallel);
tstart = ReadClock();
Lock(thread[tid].tree->parent->lock);
thread[tid].tree->parent->joinable = 0;
CopyToParent((TREE *) thread[tid].tree->parent, thread[tid].tree, value);
thread[tid].tree->parent->nprocs--;
thread[tid].tree->parent->siblings[tid] = 0;
Unlock(thread[tid].tree->parent->lock);
thread[tid].tree = 0;
tend = ReadClock();
thread[tid].idle += tend - tstart;
}
}
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* CopyFromParent() is used to copy data from a parent thread to a child *
* thread. This only copies the appropriate parts of the TREE structure to *
* avoid burning memory bandwidth by copying everything. *
* *
*******************************************************************************
*/
void CopyFromParent(TREE * RESTRICT child) {
TREE *parent = child->parent;
int i, ply;
/*
************************************************************
* *
* We have allocated a split block. Now we copy the tree *
* search state from the parent block to the child in *
* preparation for starting the parallel search. *
* *
************************************************************
*/
ply = parent->ply;
child->ply = ply;
child->position = parent->position;
for (i = 0; i <= rep_index + parent->ply; i++)
child->rep_list[i] = parent->rep_list[i];
for (i = ply - 1; i < MAXPLY; i++)
child->killers[i] = parent->killers[i];
for (i = ply - 1; i <= ply; i++) {
child->curmv[i] = parent->curmv[i];
child->pv[i] = parent->pv[i];
}
child->in_check = parent->in_check;
child->last[ply] = child->move_list;
child->status[ply] = parent->status[ply];
child->status[1] = parent->status[1];
child->save_hash_key[ply] = parent->save_hash_key[ply];
child->save_pawn_hash_key[ply] = parent->save_pawn_hash_key[ply];
child->nodes_searched = 0;
child->fail_highs = 0;
child->fail_high_first_move = 0;
child->evaluations = 0;
child->egtb_probes = 0;
child->egtb_hits = 0;
child->extensions_done = 0;
child->qchecks_done = 0;
child->moves_fpruned = 0;
child->moves_mpruned = 0;
for (i = 0; i < 16; i++) {
child->LMR_done[i] = 0;
child->null_done[i] = 0;
}
child->alpha = parent->alpha;
child->beta = parent->beta;
child->value = parent->value;
child->wtm = parent->wtm;
child->depth = parent->depth;
child->searched = parent->searched;
strcpy(child->root_move_text, parent->root_move_text);
strcpy(child->remaining_moves_text, parent->remaining_moves_text);
}
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* CopyToParent() is used to copy data from a child thread to a parent *
* thread. This only copies the appropriate parts of the TREE structure to *
* avoid burning memory bandwidth by copying everything. *
* *
*******************************************************************************
*/
void CopyToParent(TREE * RESTRICT parent, TREE * RESTRICT child, int value) {
int i, ply = parent->ply, which;
/*
************************************************************
* *
* The only concern here is to make sure that the info is *
* only copied to the parent if our score is > than the *
* parent value, and that we were not stopped for any *
* reason which could produce a partial score that is *
* worthless and dangerous to use. *
* *
* One important special case. If we get here with the *
* thread->stop flag set, and ply is 1, then we need to *
* clear the "this move has been searched" flag in the ply *
* 1 move list since we did not complete the search. If *
* we fail to do this, then a move being searched in *
* parallel at the root will be "lost" for this iteration *
* and won't be searched again until the next iteration. *
* *
* In any case, we add our statistical counters to the *
* parent's totals no matter whether we finished or not *
* since the total nodes searched and such should consider *
* everything searched, not just the "useful stuff." *
* *
* After we finish copying everything, we mark this split *
* block as free in the split block bitmap. *
* *
************************************************************
*/
if (child->nodes_searched && !child->stop && value > parent->value &&
!abort_search) {
parent->pv[ply] = child->pv[ply];
parent->value = value;
parent->cutmove = child->curmv[ply];
}
if (child->stop && ply == 1)
for (which = 0; which < n_root_moves; which++)
if (root_moves[which].move == child->curmv[ply]) {
root_moves[which].status &= 7;
break;
}
parent->nodes_searched += child->nodes_searched;
parent->fail_highs += child->fail_highs;
parent->fail_high_first_move += child->fail_high_first_move;
parent->evaluations += child->evaluations;
parent->egtb_probes += child->egtb_probes;
parent->egtb_hits += child->egtb_hits;
parent->extensions_done += child->extensions_done;
parent->qchecks_done += child->qchecks_done;
parent->moves_fpruned += child->moves_fpruned;
parent->moves_mpruned += child->moves_mpruned;
for (i = 1; i < 16; i++) {
parent->LMR_done[i] += child->LMR_done[i];
parent->null_done[i] += child->null_done[i];
}
which = FindBlockID(child) - 64 * child->thread_id - 1;
Set(which, thread[child->thread_id].blocks);
}
/* modified 11/04/15 */
/*
*******************************************************************************
* *
* GetBlock() is used to allocate a split block and fill in only SMP- *
* critical information. The child process will copy the rest of the split *
* block information as needed. *
* *
* When we arrive here, the parent split block must be locked since we are *
* going to change data in that block as well as copy data from that block *
* the current split block. The only exception is when this is the original *
* split operation, since this is done "out of sight" of other threads which *
* means no locks are needed until after the "joinable" flag is set, which *
* exposes this split point to other threads instantly. *
* *
*******************************************************************************
*/
TREE *GetBlock(TREE * RESTRICT parent, int tid) {
TREE *child;
static int warnings = 0;
int i, unused;
/*
************************************************************
* *
* One NUMA-related trick is that we only allocate a split *
* block in the thread's local memory. Each thread has a *
* group of split blocks that were first touched by the *
* correct CPU so that the split blocks page-faulted into *
* local memory for that specific processor. If we can't *
* find an optimally-placed block, we return a zero which *
* will prevent this thread from joining the split point. *
* This is highly unlikely as it would mean the current *
* thread has 64 active split blocks where it is waiting *
* on other threads to complete the last bit of work at *
* each. This is extremely unlikely. *
* *
* Here we use a simple 64 bit bit-map per thread that *
* indicates which blocks are free (1) and which blocks *
* used (0). We simply use LSB() to find the rightmost *
* one-bit set and that is the local block number. We *
* convert that to a global block number by doing the *
* simple computation: *
* *
* global = local + 64 * tid + 1 *
* *
* Each thread has exactly 64 split blocks, and block 0 *
* is the "master block" that never gets allocated or *
* freed. Once we find a free block for the current *
* thread, we zero that bit so that the block won't be *
* used again until it is released. *
* *
************************************************************
*/
if (thread[tid].blocks) {
unused = LSB(thread[tid].blocks);
Clear(unused, thread[tid].blocks);
Set(unused, thread[tid].max_blocks);
} else {
if (++warnings < 6)
Print(2048,
"WARNING. local SMP block cannot be allocated, thread %d\n", tid);
return 0;
}
child = block[unused + tid * 64 + 1];
/*
************************************************************
* *
* Found a split block. Now we need to fill in only the *
* critical information that can't be delayed due to race *
* conditions. When we get here, the parent split block *
* must be locked, that lets us safely update the number *
* of processors working here, etc, without any ugly race *
* conditions that would corrupt this critical data. *
* *
************************************************************
*/
for (i = 0; i < smp_max_threads; i++)
child->siblings[i] = 0;
child->nprocs = 0;
child->stop = 0;
child->joinable = 0;
child->joined = 0;
child->parent = parent;
child->thread_id = tid;
parent->nprocs++;
parent->siblings[tid] = child;
parent->joined = 1;
return child;
}
/*
*******************************************************************************
* *
* WaitForAllThreadsInitialized() waits until all smp_max_threads are *
* initialized. We have to initialize each thread and malloc() its split *
* blocks before we start the actual parallel search. Otherwise we will see *
* invalid memory accesses and crash instantly. *
* *
*******************************************************************************
*/
void WaitForAllThreadsInitialized(void) {
while (initialized_threads < smp_max_threads); /* Do nothing */
}
# if !defined (UNIX)
/* modified 01/17/09 */
/*
*******************************************************************************
* *
* ThreadMalloc() is called from the ThreadInit() function. It malloc's the *
* split blocks in the local memory for the processor associated with the *
* specific thread that is calling this code. *
* *
*******************************************************************************
*/
extern void *WinMalloc(size_t, int);
void ThreadMalloc(int64_t tid) {
int i;
for (i = tid * 64 + 1; i < tid * 64 + 65; i++) {
if (block[i] == NULL)
block[i] =
(TREE *) ((~(size_t) 127) & (127 + (size_t) WinMalloc(sizeof(TREE) +
127, tid)));
block[i]->parent = NULL;
LockInit(block[i]->lock);
}
}
# endif
#endif