forked from privacore/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCollectiondb.cpp
2427 lines (2062 loc) · 73.3 KB
/
Collectiondb.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "Collectiondb.h"
#include "Xml.h"
#include "Url.h"
#include "Loop.h"
#include "Spider.h" // for calling SpiderLoop::collectionsUpdated()
#include "SpiderLoop.h"
#include "SpiderColl.h"
#include "Doledb.h"
#include "Posdb.h"
#include "Titledb.h"
#include "Tagdb.h"
#include "Spider.h"
#include "Clusterdb.h"
#include "Linkdb.h"
#include "SpiderCache.h"
#include "Repair.h"
#include "Parms.h"
#include "Process.h"
#include "HttpRequest.h"
#include "Dir.h"
#include "File.h"
#include "Conf.h"
#include "Mem.h"
#include "Errno.h"
#include "utf8_fast.h"
#include <sys/stat.h> //mkdir()
// File-local table mapping the 64-bit hash of a collection name to its
// collnum_t. It is initialized on first use in Collectiondb::setRecPtr()
// and looked up in Collectiondb::getCollnum().
static HashTableX g_collTable;
// a global class extern'd in .h file
Collectiondb g_collectiondb;
// Start with an empty record table, then verify at startup that RDB_END2
// (from Collectiondb.h) is large enough to cover every rdb id up to RDB_END.
Collectiondb::Collectiondb ( ) {
	m_recs         = NULL;
	m_numRecs      = 0;
	m_numRecsUsed  = 0;
	m_wrapped      = 0;
	m_initializing = false;

	// sanity: abort if the per-rdb arrays would be too small
	if ( RDB_END2 < RDB_END ) {
		log("db: increase RDB_END2 to at least %" PRId32" in Collectiondb.h",(int32_t)RDB_END);
		g_process.shutdownAbort(true);
	}
}
// Releases every CollectionRec still held by delegating to reset().
Collectiondb::~Collectiondb() {
	this->reset();
}
// reset rdb
void Collectiondb::reset() {
log(LOG_INFO,"db: resetting collectiondb.");
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) {
continue;
}
mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
delete ( m_recs[i] );
m_recs[i] = NULL;
}
m_numRecs = 0;
m_numRecsUsed = 0;
g_collTable.reset();
}
// . save to disk
// . returns false if blocked, true otherwise
bool Collectiondb::save ( ) {
if ( g_conf.m_readOnlyMode ) {
return true;
}
if ( g_inAutoSave && m_numRecsUsed > 20 && g_hostdb.m_myHostId != 0 ) {
return true;
}
// which collection rec needs a save
for (int32_t i = 0; i < m_numRecs; i++) {
if (!m_recs[i]) {
continue;
}
m_recs[i]->save();
}
// oh well
return true;
}
///////////
//
// fill up our m_recs[] array based on the coll.*.*/coll.conf files
//
///////////
// . scans g_hostdb.m_dir for entries named "coll.<name>.<collnum>" and calls
//   addExistingColl() for each one
// . if nothing was loaded, creates the "main" collection as collnum 0
// . returns false if the directory can not be opened or a collection
//   fails to load (m_initializing is left set in that case, as before)
bool Collectiondb::loadAllCollRecs ( ) {
	m_initializing = true;

	char dname[1024];
	// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
	// bounded copy so an over-long working dir can not overrun dname
	snprintf ( dname , sizeof(dname) , "%s" , g_hostdb.m_dir );

	Dir d;
	d.set ( dname );
	if ( ! d.open ()) {
		log( LOG_WARN, "admin: Could not load collection config files." );
		return false;
	}

	// . scan through all subdirs in the collections dir
	// . they should be like, "coll.main.0/" and "coll.mycollection.1/"
	while ( const char *f = d.getNextFilename ( "coll.*" ) ) {
		// must end on a digit (i.e. coll.main.0)
		const size_t flen = strlen(f);
		if ( flen == 0 || ! is_digit (f[flen-1]) ) continue;

		// point to collection name, just past the "coll." prefix
		const char *coll = f + 5;

		// the name runs up to the '.' that precedes the collnum
		const char *pp = strchr ( coll , '.' );
		if ( ! pp ) continue;

		// the name comes from the filesystem, so make sure it (plus the
		// terminating NUL) fits in collname before copying it
		char collname[256];
		const size_t nameLen = (size_t)(pp - coll);
		if ( nameLen >= sizeof(collname) ) {
			log( LOG_WARN, "admin: Skipping collection dir '%s' because its collection name is too long.", f );
			continue;
		}
		memcpy(collname,coll,nameLen);
		collname[nameLen] = '\0';

		// get collnum
		collnum_t collnum = atol ( pp + 1 );

		// add it
		if ( ! addExistingColl ( collname, collnum ) )
			return false;
	}

	// if no existing recs added... add coll.main.0 always at startup
	if ( m_numRecs == 0 ) {
		log("admin: adding main collection.");
		addNewColl ( "main", 0 );
	}

	m_initializing = false;
	return true;
}
// after we've initialized all rdbs in main.cpp call this to clean out
// our rdb trees
// . calls cleanTree() on posdb, titledb, tagdb, spiderdb, doledb, clusterdb
//   and linkdb, in that order
// . always returns true
bool Collectiondb::cleanTrees() {
	// remove any nodes with illegal collnums
	g_posdb.getRdb()->cleanTree();
	g_titledb.getRdb()->cleanTree();
	g_tagdb.getRdb()->cleanTree();
	g_spiderdb.getRdb_deprecated()->cleanTree();
	g_doledb.getRdb()->cleanTree();
	g_clusterdb.getRdb()->cleanTree();
	g_linkdb.getRdb()->cleanTree();
	// success
	return true;
}
// same as addOldColl()
bool Collectiondb::addExistingColl ( const char *coll, collnum_t collnum ) {
int32_t i = collnum;
// ensure does not already exist in memory
collnum_t oldCollnum = getCollnum(coll);
if ( oldCollnum >= 0 ) {
g_errno = EEXIST;
log("admin: Trying to create collection \"%s\" but already exists in memory. "
"Do an ls on the working dir to see if there are two collection dirs with the same coll name",
coll);
g_process.shutdownAbort(true);
}
// also try by #, i've seen this happen too
CollectionRec *ocr = getRec ( i );
if ( ocr ) {
g_errno = EEXIST;
log(LOG_WARN, "admin: Collection id %i is in use already by %s, so we can not add %s. moving %s to trash.",
(int)i, ocr->m_coll,coll,coll);
SafeBuf cmd;
int64_t now = gettimeofdayInMilliseconds();
cmd.safePrintf ( "mv coll.%s.%i trash/coll.%s.%i.%" PRIu64
, coll
,(int)i
, coll
,(int)i
, now );
//log("admin: %s",cmd.getBufStart());
gbsystem ( cmd.getBufStart() );
return true;
}
// create the record in memory
CollectionRec *cr;
try {
cr = new (CollectionRec);
}
catch(std::bad_alloc&) {
log( LOG_WARN, "admin: Failed to allocated %" PRId32" bytes for new collection record for '%s'.",
(int32_t)sizeof(CollectionRec),coll);
return false;
}
mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
// get the default.conf from working dir if there
g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
strcpy ( cr->m_coll , coll );
cr->m_collLen = strlen ( coll );
cr->m_collnum = i;
// point to this, so Rdb and RdbBase can reference it
coll = cr->m_coll;
//log("admin: loaded old coll \"%s\"",coll);
// load coll.conf file
if ( ! cr->load ( coll , i ) ) {
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
log(LOG_WARN, "admin: Failed to load coll.%s.%" PRId32"/coll.conf",coll,i);
delete ( cr );
if ( m_recs ) m_recs[i] = NULL;
return false;
}
if ( ! registerCollRec ( cr ) ) return false;
// we need to compile the regular expressions or update the url
// filters with new logic that maps crawlbot parms to url filters
return cr->rebuildUrlFilters ( );
}
// . add a new rec
// . returns false and sets g_errno on error
// . was addRec()
// . "isDump" is true if we don't need to initialize all the rdbs etc
// because we are doing a './gb dump ...' cmd to dump out data from
// one Rdb which we will custom initialize in main.cpp where the dump
// code is. like for instance, posdb.
bool Collectiondb::addNewColl ( const char *coll,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
collnum_t newCollnum ) {
//do not send add/del coll request until we are in sync with shard!!
// just return ETRYAGAIN for the parmlist...
if( !coll ) {
logError("Called with NULL coll parameter");
return false;
}
// ensure coll name is legit
const char *p = coll;
for ( ; *p ; p++ ) {
if ( is_alnum_a(*p) ) continue;
if ( *p == '-' ) continue;
if ( *p == '_' ) continue; // underscore now allowed
break;
}
if ( *p ) {
g_errno = EBADENGINEER;
log( LOG_WARN, "admin: '%s' is a malformed collection name because it contains the '%c' character.",coll,*p);
return false;
}
if ( newCollnum < 0 ) { g_process.shutdownAbort(true); }
// if empty... bail, no longer accepted, use "main"
if ( !coll[0] ) {
g_errno = EBADENGINEER;
log( LOG_WARN, "admin: Trying to create a new collection but no collection name provided. "
"Use the 'c' cgi parameter to specify it.");
return false;
}
// or if too big
if ( strlen(coll) > MAX_COLL_LEN ) {
g_errno = ENOBUFS;
log(LOG_WARN, "admin: Trying to create a new collection whose name '%s' of %zd chars is longer than the max of %" PRId32" chars.",
coll, strlen(coll), (int32_t)MAX_COLL_LEN );
return false;
}
// ensure does not already exist in memory
if ( getCollnum ( coll ) >= 0 ) {
g_errno = EEXIST;
log( LOG_WARN, "admin: Trying to create collection '%s' but already exists in memory.",coll);
// just let it pass...
g_errno = 0 ;
return true;
}
// MDW: ensure not created on disk since time of last load
char dname[512];
sprintf(dname, "%scoll.%s.%" PRId32"/",g_hostdb.m_dir,coll,(int32_t)newCollnum);
DIR *dir = opendir ( dname );
if ( dir ) {
closedir ( dir );
g_errno = EEXIST;
log(LOG_WARN, "admin: Trying to create collection %s but directory %s already exists on disk.",coll,dname);
return false;
}
// create the record in memory
CollectionRec *cr;
try {
cr = new (CollectionRec);
}
catch(std::bad_alloc&) {
log( LOG_WARN, "admin: Failed to allocated %" PRId32" bytes for new collection record for '%s'.",
( int32_t ) sizeof( CollectionRec ), coll );
return false;
}
// register the mem
mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
// get the default.conf from working dir if there
g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
// set coll id and coll name for coll id #i
strcpy ( cr->m_coll , coll );
cr->m_collLen = strlen ( coll );
cr->m_collnum = newCollnum;
// point to this, so Rdb and RdbBase can reference it
coll = cr->m_coll;
cr->setNeedsSave();
//
// BEGIN NEW CODE
//
// . just the basics on these for now
// . if certain parms are changed then the url filters
// must be rebuilt, as well as possibly the waiting tree!!!
// . need to set m_urlFiltersHavePageCounts etc.
cr->rebuildUrlFilters ( );
cr->m_useRobotsTxt = true;
// note that
log("colldb: initial revival for %s",cr->m_coll);
// start the spiders!
cr->m_spideringEnabled = true;
//
// END NEW CODE
//
//log("admin: adding coll \"%s\" (new=%" PRId32")",coll,(int32_t)isNew);
// MDW: create the new directory
if ( ::mkdir ( dname, getDirCreationFlags() ) ) {
g_errno = errno;
mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
delete ( cr );
log( LOG_WARN, "admin: Creating directory %s had error: %s.", dname,mstrerror(g_errno));
return false;
}
// save it into this dir... might fail!
if ( ! cr->save() ) {
mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
delete ( cr );
log( LOG_WARN, "admin: Failed to save file %s: %s", dname,mstrerror(g_errno));
return false;
}
if ( ! registerCollRec ( cr ) ) {
return false;
}
// add the rdbbases for this coll, CollectionRec::m_bases[]
return addRdbBasesForCollRec ( cr );
}
// . store "base" as this collection's RdbBase for rdb "rdbId"
// . aborts on an out-of-range rdbId
// . aborts when trying to install a non-NULL base over an existing one;
//   passing NULL simply clears the slot
void CollectionRec::setBasePtr(rdbid_t rdbId, class RdbBase *base) {
	const bool idInRange = ( rdbId >= 0 && rdbId < RDB_END );
	if ( ! idInRange ) { g_process.shutdownAbort(true); }

	const unsigned char slot = (unsigned char)rdbId;

	// Rdb::deleteColl() will call this even though we are swapped in
	// but it calls it with "base" set to NULL after it nukes the RdbBase
	// so only complain when a real base would overwrite another one.
	if ( base && m_bases[ slot ] ) { g_process.shutdownAbort(true); }

	m_bases [ slot ] = base;
}
// . returns the RdbBase stored for "rdbId" (NULL if none has been set)
// . aborts on an out-of-range rdbId
// . TODO: ensure not called from in thread, not thread safe
RdbBase *CollectionRec::getBase(rdbid_t rdbId) {
	const bool idInRange = ( rdbId >= 0 && rdbId < RDB_END );
	if ( ! idInRange ) { g_process.shutdownAbort(true); }

	const unsigned char slot = (unsigned char)rdbId;
	return m_bases[slot];
}
// . called only by addNewColl() and by addExistingColl()
// . files "cr" under its own m_collnum in m_recs[] and in the name hashtable
// . returns whatever setRecPtr() returns
bool Collectiondb::registerCollRec ( CollectionRec *cr ) {
	const collnum_t slot = cr->m_collnum;
	return setRecPtr ( slot , cr );
}
// swap it in
bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
CollectionRec *cr = m_recs[i];
if ( ! cr ) continue;
// add rdb base files etc. for it
addRdbBasesForCollRec ( cr );
}
// now clean the trees. moved this into here from
// addRdbBasesForCollRec() since we call addRdbBasesForCollRec()
// now from getBase() to load on-demand for saving memory
cleanTrees();
return true;
}
// . tell each rdb to add an RdbBase for this collection
// . order: posdb, titledb, tagdb, clusterdb, linkdb, spiderdb, doledb;
//   stops at the first rdb that fails
// . returns true right away when running a command line dump (g_dumpMode)
// . returns false after logging g_errno if any rdb fails
bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
	char *collName = cr->m_coll;

	// if we are doing a dump from the command line, skip this stuff
	if ( g_dumpMode ) return true;

	// short-circuit evaluation keeps the original call order and stops
	// at the first failure, just like the goto chain it replaces
	const bool allAdded =
		g_posdb.getRdb()->addRdbBase1 ( collName ) &&
		g_titledb.getRdb()->addRdbBase1 ( collName ) &&
		g_tagdb.getRdb()->addRdbBase1 ( collName ) &&
		g_clusterdb.getRdb()->addRdbBase1 ( collName ) &&
		g_linkdb.getRdb()->addRdbBase1 ( collName ) &&
		g_spiderdb.getRdb_deprecated()->addRdbBase1 ( collName ) &&
		g_doledb.getRdb()->addRdbBase1 ( collName );

	if ( allAdded ) return true;

	log(LOG_WARN, "db: error registering coll: %s",mstrerror(g_errno));
	return false;
}
/// this deletes the collection, not just part of a reset.
// . returns false if an rdb tree is currently saving and the caller must
//   call again later
// . returns true otherwise, with g_errno set if the delete was refused
//   (repair in progress on this collection, bad collnum, no such collection)
// . removes the collection from posdb, titledb, tagdb, doledb, clusterdb
//   and linkdb, puts its SpiderColl (if any) on death row, then unregisters
//   and frees the CollectionRec
bool Collectiondb::deleteRec2 ( collnum_t collnum ) {
	// do not allow this if in repair mode
	if ( g_repair.isRepairActive() && g_repair.isRepairingColl(collnum) ) {
		log(LOG_WARN, "admin: Can not delete collection while in repair mode.");
		g_errno = EBADENGINEER;
		return true;
	}

	// bitch if not found. m_recs[] only has m_numRecs valid slots (and
	// is NULL when there are none), so check the upper bound as well
	// before indexing into it below.
	if ( collnum < 0 || collnum >= m_numRecs ) {
		g_errno = ENOTFOUND;
		log(LOG_LOGIC,"admin: Collection #%" PRId32" is bad, delete failed.",
		    (int32_t)collnum);
		return true;
	}

	CollectionRec *cr = m_recs [ collnum ];
	if ( ! cr ) {
		log(LOG_WARN, "admin: Collection id problem. Delete failed.");
		g_errno = ENOTFOUND;
		return true;
	}

	if ( g_process.isAnyTreeSaving() ) {
		// note it
		log("admin: tree is saving. waiting2.");
		// all done
		return false;
	}

	char *coll = cr->m_coll;

	// note it
	log(LOG_INFO,"db: deleting coll \"%s\" (%" PRId32")",coll,
	    (int32_t)cr->m_collnum);

	cr->setNeedsSave();

	// CAUTION: tree might be in the middle of saving
	// we deal with this in Process.cpp now

	// . TODO: remove from g_sync
	// . remove from all rdbs
	g_posdb.getRdb()->delColl ( coll );
	g_titledb.getRdb()->delColl ( coll );
	g_tagdb.getRdb()->delColl ( coll );
	g_doledb.getRdb()->delColl ( coll );
	g_clusterdb.getRdb()->delColl ( coll );
	g_linkdb.getRdb()->delColl ( coll );

	// reset spider info
	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
	if ( sc ) {
		// remove locks from lock table:
		sc->clearLocks();

		// you have to set this for tryToDeleteSpiderColl to
		// actually have a shot at deleting it
		sc->m_deleteMyself = true;

		sc->setCollectionRec ( NULL );
		// this will put it on "death row" so it will be deleted
		// once Msg5::m_waitingForList/Merge is NULL
		SpiderColl::tryToDeleteSpiderColl ( sc , "10" );

		// don't let cr reference us anymore, sc is on deathrow
		// and "cr" is delete below!
		cr->m_spiderColl = NULL;
	}

	//////
	//
	// remove from m_recs[]
	//
	//////
	setRecPtr ( cr->m_collnum , NULL );

	// free it
	mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
	delete ( cr );

	// do not do this here in case spiders were outstanding
	// and they added a new coll right away and it ended up getting
	// recs from the deleted coll!!
	//while ( ! m_recs[m_numRecs-1] ) m_numRecs--;

	// update the time
	//updateTime();
	// done
	return true;
}
// . ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
// . the slot m_recs[collnum] is set to NULL on success
// . returns false if the pointer buffer could not be grown
bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {
	// bytes required to hold slots 0..collnum versus bytes in use now
	const int32_t bytesNeeded = ((int32_t)collnum+1)*sizeof(CollectionRec *);
	const int32_t bytesInUse  = m_recPtrBuf.length();
	const int32_t shortfall   = bytesNeeded - bytesInUse;

	// only grow when the used part of the buffer is too small
	if ( shortfall > 0 ) {
		m_recPtrBuf.setLabel ("crecptrb");

		// . true here means to clear the new space to zeroes
		// . reserve() works relative to m_length, not m_capacity
		if ( ! m_recPtrBuf.reserve ( shortfall ,NULL, true ) ) {
			log( LOG_WARN, "admin: error growing rec ptr buf2.");
			return false;
		}

		// sanity
		if ( m_recPtrBuf.getCapacity() < bytesNeeded ) { g_process.shutdownAbort(true); }

		// the buffer may have moved, so re-point m_recs at it
		m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();

		// update length of used bytes in case we re-alloc
		m_recPtrBuf.setLength ( bytesNeeded );

		// sanity: the slot must now fit within the capacity
		const int32_t slotCapacity = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
		if ( collnum >= slotCapacity ) { g_process.shutdownAbort(true); }
	}

	// initialize slot
	m_recs [ collnum ] = NULL;
	return true;
}
// . set m_recs[collnum] to "cr" and keep the name -> collnum hashtable,
//   m_numRecsUsed and m_numRecs in sync
// . a NULL "cr" is a delete: the slot is cleared and the name is removed
//   from the hashtable, but only if the table maps it to this collnum
// . a non-NULL "cr" is an add: the pointer buffer is grown if needed and
//   cr->m_collnum must equal "collnum"
// . aborts on a negative collnum
// . returns false if the hashtable or pointer buffer could not be set up
//   or grown; "cr" is not stored in that case
bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
	// first time init hashtable that maps coll to collnum
	if ( !g_collTable.isInitialized() &&
	     ! g_collTable.set(8,sizeof(collnum_t), 256,NULL,0, false,"nhshtbl")) {
		return false;
	}

	// sanity
	if ( collnum < 0 ) { g_process.shutdownAbort(true); }

	// how many rec ptrs the buffer can currently hold
	int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);

	// set it
	m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();

	// tell spiders to re-update the active list
	g_spiderLoop.invalidateActiveList();

	// a delete?
	if ( ! cr ) {
		// sanity
		if ( collnum >= max ) { g_process.shutdownAbort(true); }

		// get what's there
		CollectionRec *oc = m_recs[collnum];

		// let it go
		m_recs[collnum] = NULL;

		// if nothing already, done
		if ( ! oc ) return true;

		// tally it up
		m_numRecsUsed--;

		// delete key
		int64_t h64 = hash64n(oc->m_coll);

		// if in the hashtable UNDER OUR COLLNUM then nuke it
		// otherwise, we might be called from resetColl2()
		void *vp = g_collTable.getValue ( &h64 );
		if ( ! vp ) return true;

		collnum_t ct = *(collnum_t *)vp;
		if ( ct != collnum ) return true;

		g_collTable.removeKey ( &h64 );
		return true;
	}

	// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
	if ( ! growRecPtrBuf ( collnum ) ) {
		return false;
	}

	// sanity
	if ( cr->m_collnum != collnum ) { g_process.shutdownAbort(true); }

	// add to hash table to map name to collnum_t
	int64_t h64 = hash64n(cr->m_coll);

	// debug
	//log("coll: adding key %" PRId64" for %s",h64,cr->m_coll);
	if ( ! g_collTable.addKey ( &h64 , &collnum ) )
		return false;

	// store the rec in its slot
	m_recs[collnum] = cr;

	// count it
	m_numRecsUsed++;

	//log("coll: adding key4 %" PRIu64" for coll \"%s\" (%" PRId32")",h64,cr->m_coll,
	//    (int32_t)i);

	// make sure m_numRecs covers this slot
	if ( collnum >= m_numRecs ) m_numRecs = collnum + 1;

	// NOTE: a "sanity" loop over all of m_recs[] used to live here, but
	// both of its branches were "continue", so it verified nothing while
	// costing a full scan of the table on every add.

	return true;
}
// . move the collection at "oldCollnum" to "newCollnum" and wipe its data
// . returns false if we need a re-call, true if we completed
// . returns true with g_errno set on error (repair in progress, no such
//   collection, directory name too long)
// . the CollectionRec object is recycled: it gets the new collnum, a new
//   "coll.<name>.<newCollnum>/" directory and its old slot is cleared
bool Collectiondb::resetColl2(collnum_t oldCollnum, collnum_t newCollnum) {
	// do not allow this if in repair mode
	if ( g_repair.isRepairActive() && g_repair.isRepairingColl(oldCollnum) ) {
		log(LOG_WARN, "admin: Can not delete collection while in repair mode.");
		g_errno = EBADENGINEER;
		return true;
	}

	//log("admin: resetting collnum %" PRId32,(int32_t)oldCollnum);

	// CAUTION: tree might be in the middle of saving
	// we deal with this in Process.cpp now
	if ( g_process.isAnyTreeSaving() ) {
		// we could not complete...
		return false;
	}

	// m_recs[] only has m_numRecs valid slots and the slot may be empty,
	// so validate before indexing and dereferencing
	if ( oldCollnum < 0 || oldCollnum >= m_numRecs || ! m_recs [ oldCollnum ] ) {
		g_errno = ENOTFOUND;
		log(LOG_WARN, "admin: Collection #%" PRId32" not found, reset failed.",
		    (int32_t)oldCollnum);
		return true;
	}

	CollectionRec *cr = m_recs [ oldCollnum ];

	// . a new directory then since we changed the collnum
	// . build the name before we change any state so that we can bail
	//   cleanly if it does not fit (it depends only on the coll name
	//   and the new collnum)
	char dname[512];
	const int dnameLen = snprintf(dname, sizeof(dname), "%scoll.%s.%" PRId32"/",
				      g_hostdb.m_dir,
				      cr->m_coll,
				      (int32_t)newCollnum);
	if ( dnameLen < 0 || dnameLen >= (int)sizeof(dname) ) {
		g_errno = ENOBUFS;
		log(LOG_WARN, "admin: Directory name for collection '%s' is too long, reset failed.",
		    cr->m_coll);
		return true;
	}

	// reset spider info
	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
	if ( sc ) {
		// remove locks from lock table:
		sc->clearLocks();

		// this will put it on "death row" so it will be deleted
		// once Msg5::m_waitingForList/Merge is NULL
		SpiderColl::tryToDeleteSpiderColl ( sc, "11" );

		cr->m_spiderColl = NULL;
	}

	cr->m_spiderStatus = spider_status_t::SP_INITIALIZING; // this is 0
	//cr->m_spiderStatusMsg = NULL;

	// so XmlDoc.cpp can detect if the collection was reset since it
	// launched its spider:
	cr->m_lastResetCount++;

	if ( newCollnum >= m_numRecs ) m_numRecs = (int32_t)newCollnum + 1;

	// advance sanity check. did we wrap around?
	// right now we #define collnum_t int16_t
	if ( m_numRecs > 0x7fff ) { g_process.shutdownAbort(true); }

	// make a new collnum so records in transit will not be added
	// to any rdb...
	cr->m_collnum = newCollnum;

	////////
	//
	// ALTER m_recs[] array
	//
	////////

	// Rdb::resetColl() needs to know the new cr so it can move
	// the RdbBase into cr->m_bases[rdbId] array. recycling.
	setRecPtr ( newCollnum , cr );

	DIR *dir = opendir ( dname );
	if ( dir ) {
		closedir ( dir );
		//g_errno = EEXIST;
		log(LOG_WARN, "admin: Trying to create collection %s but directory %s already exists on disk.",
		    cr->m_coll,dname);
	}
	if ( ::mkdir ( dname, getDirCreationFlags() ) ) {
		// report the error mkdir() actually returned. g_errno is
		// deliberately left alone here, so it must not be what we print.
		const int mkdirErr = errno;
		log(LOG_WARN, "admin: Creating directory %s had error: %s.",
		    dname,mstrerror(mkdirErr));
	}

	// . unlink all the *.dat and *.map files for this coll in its subdir
	// . remove all recs from this collnum from m_tree/m_buckets
	// . updates RdbBase::m_collnum
	// . so for the tree it just needs to mark the old collnum recs
	//   with a collnum -1 in case it is saving...
	g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );

	// reset crawl status too!
	cr->m_spiderStatus = spider_status_t::SP_INITIALIZING;

	// . set m_recs[oldCollnum] to NULL and remove from hash table
	// . do after calls to deleteColl() above so it wont crash
	setRecPtr ( oldCollnum , NULL );

	// save coll.conf to new directory
	cr->save();

	// done
	return true;
}
// . get coll rec specified in the HTTP request via its "c" parameter
// . a missing or empty "c" yields NULL unless useDefaultRec is set, in
//   which case "main" is tried first and then the first loaded collection
CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
	const char *requested = r->getString ( "c" );

	// an explicit, non-empty name always wins
	const bool haveName = ( requested && requested[0] );
	if ( haveName ) return g_collectiondb.getRec ( requested );

	// give up?
	if ( ! useDefaultRec ) return NULL;

	// default to main first
	CollectionRec *mainRec = g_collectiondb.getRec("main");
	if ( mainRec ) return mainRec;

	return getFirstRec ();
}
const char *Collectiondb::getDefaultColl(const char *collname_from_httprequest) {
const char *coll = collname_from_httprequest;
if ( coll && ! coll[0] ) coll = NULL;
if ( coll ) return coll;
CollectionRec *cr = NULL;
// default to main first
if ( ! coll ) {
cr = g_collectiondb.getRec("main");
// CAUTION: cr could be deleted so don't trust this ptr
// if you give up control of the cpu
if ( cr ) return cr->m_coll;
}
// try next in line
if ( ! coll ) {
cr = getFirstRec ();
if ( cr ) return cr->m_coll;
}
// give up?
return NULL;
}
// . get collectionRec from name
// . a NULL name is treated as the empty string
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( const char *coll ) {
	const char *name = coll ? coll : "";
	return getRec ( name , strlen(name) );
}
// . get collectionRec from a name of length collLen
// . a NULL name is treated as the empty string
// . returns NULL if getCollnum() does not know the name
CollectionRec *Collectiondb::getRec ( const char *coll , int32_t collLen ) {
	const char *name = coll ? coll : "";

	const collnum_t found = getCollnum ( name , collLen );
	if ( found >= 0 ) {
		return m_recs [ (int32_t)found ];
	}
	return NULL;
}
// . get collectionRec from collnum
// . returns NULL for a collnum outside [0,m_numRecs) or an empty slot
CollectionRec *Collectiondb::getRec ( collnum_t collnum) {
	// Rdb::resetBase() gets here with out-of-range collnums, so don't
	// log. it is called from CollectionRec::reset() which is called
	// from the CollectionRec constructor and ::load() so
	// it won't have anything in rdb at that time
	const bool inRange = ( collnum >= 0 && collnum < m_numRecs );
	if ( inRange ) {
		return m_recs[collnum];
	}
	return NULL;
}
// . returns the rec in the lowest occupied slot of m_recs[]
// . returns NULL if no collection is loaded
CollectionRec *Collectiondb::getFirstRec ( ) {
	int32_t slot = 0;
	while ( slot < m_numRecs ) {
		CollectionRec *rec = m_recs[slot];
		if ( rec ) return rec;
		slot++;
	}
	return NULL;
}
// . returns the index of the lowest occupied slot of m_recs[]
// . returns -1 if no collection is loaded
collnum_t Collectiondb::getFirstCollnum() const {
	int32_t slot = 0;
	while ( slot < m_numRecs ) {
		if ( m_recs[slot] ) return slot;
		slot++;
	}
	return (collnum_t)-1;
}
// . returns the name of the collection stored at "collnum"
// . returns NULL for a collnum outside [0,m_numRecs) or an empty slot
const char *Collectiondb::getCollName(collnum_t collnum) const {
	// valid slots are 0..m_numRecs-1, so m_numRecs itself is out of range
	// (same bound as getRec(collnum_t))
	if ( collnum < 0 || collnum >= m_numRecs ) return NULL;
	if ( ! m_recs[(int32_t)collnum] ) return NULL;
	return m_recs[collnum]->m_coll;
}
// . look up the collnum for a NUL-terminated collection name
// . a NULL name is passed on with a length of 0
collnum_t Collectiondb::getCollnum(const char *coll) const {
	const int32_t nameLen = coll ? strlen(coll ) : 0;
	return getCollnum ( coll , nameLen );
}
// . look up the collnum for a collection name of length clen
// . a NULL or empty name falls back to g_conf.m_defaultColl, and to "main"
//   if that is empty as well
// . returns -1 if the name is not in the hashtable
collnum_t Collectiondb::getCollnum ( const char *coll , int32_t clen ) const {
	// default empty collection names
	if ( ! coll || ! coll[0] ) {
		coll = g_conf.m_defaultColl;
		clen = strlen(coll);
	}
	if ( ! coll[0] ) {
		coll = "main";
		clen = strlen(coll);
	}

	// because diffbot may have thousands of crawls/collections
	// we look the name up by its hash instead of scanning m_recs[]
	int64_t nameHash = hash64(coll,clen);
	void *slot = g_collTable.getValue ( &nameHash );

	// not found
	if ( ! slot ) return -1;

	return *(collnum_t *)slot;
}
// . what collnum will be used the next time a coll is added?
// . while fewer than 0x7fff slots exist, appends a new NULL slot to
//   m_recs[] and returns its index
// . after that, scans the existing slots for an empty one, starting at
//   m_wrapped and wrapping around
// . returns -1 if the slot can not be allocated or no slot is free
collnum_t Collectiondb::reserveCollNum ( ) {
	if ( m_numRecs < 0x7fff ) {
		collnum_t next = m_numRecs;
		// make the ptr NULL at least to accomodate the
		// loop that scan up to m_numRecs lest we core.
		// if the buffer can not grow we must NOT bump m_numRecs, or
		// those loops would read past the end of m_recs[].
		if ( ! growRecPtrBuf ( next ) ) {
			log(LOG_WARN, "colldb: could not grow rec ptr buf to reserve collnum %" PRId32".",
			    (int32_t)next);
			return -1;
		}
		m_numRecs++;
		return next;
	}

	// collnum_t is signed right now because we use -1 to indicate a
	// bad collnum.

	// . search for an empty slot, looking at each of the m_numRecs slots
	//   exactly once
	// . only indexes 0..m_numRecs-1 are valid, so wrap there rather
	//   than at 0x7fff
	int32_t i = m_wrapped;
	for ( int32_t scanned = 0 ; scanned < m_numRecs ; scanned++ , i++ ) {
		// wrap around (also guards against a bogus m_wrapped)
		if ( i < 0 || i >= m_numRecs ) i = 0;
		// skip if this is in use
		if ( m_recs[i] ) continue;
		// start after this one next time
		m_wrapped = i+1;
		// note it
		log("colldb: returning wrapped collnum of %d", (int)i);
		return (collnum_t)i;
	}

	log("colldb: no new collnum available. consider upping collnum_t");
	// none available!!
	return -1;
}
///////////////
//
// COLLECTIONREC
//
///////////////
//#include "CollectionRec.h"
CollectionRec::CollectionRec()
: m_word_variations_config()
{
m_spiderCorruptCount = 0;
m_collnum = -1;
m_coll[0] = '\0';
memset(m_bases, 0, sizeof(m_bases));
// how many keys in the tree of each rdb? we now store this stuff
// here and not in RdbTree.cpp because we no longer have a maximum
// # of collection recs... MAX_COLLS. each is a 32-bit "int32_t" so
// it is 4 * RDB_END...
memset(m_numNegKeysInTree, 0, sizeof(m_numNegKeysInTree));
memset(m_numPosKeysInTree, 0, sizeof(m_numPosKeysInTree));
m_spiderColl = NULL;
m_overflow = 0x12345678;
m_overflow2 = 0x12345678;
// the spiders are currently uninhibited i guess
m_spiderStatus = spider_status_t::SP_INITIALIZING; // this is 0
// inits for sortbydatetable
m_msg5 = NULL;
// JAB - track which regex parsers have been initialized
//log(LOG_DEBUG,"regex: %p initalizing empty parsers", m_pRegExParser);
// clear these out so Parms::calcChecksum can work and so Parms.cpp doesn't work with uninitialized data
//m_regExs: ctor() done
clearUrlFilters();
//m_requests = 0;
//m_replies = 0;
//m_doingCallbacks = false;
m_lastResetCount = 0;
// for diffbot caching the global spider stats
reset();
// Coverity
m_nextActive = NULL;
m_needsSave = false;
m_urlFiltersHavePageCounts = false;
m_collLen = 0;
m_dailyMergeStarted = 0;
m_dailyMergeTrigger = 0;
memset(m_dailyMergeDOWList, 0, sizeof(m_dailyMergeDOWList));
m_spideringEnabled = true;
m_spiderDelayInMilliseconds = 0;
m_spiderReindexDelayMS = 0;
m_isActive = false;
m_makeImageThumbnails = false;
m_thumbnailMaxWidthHeight = 0;
m_indexBody = false;
m_dedupingEnabled = false;
m_dedupURLByDefault = false;