-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun-load.py
63 lines (53 loc) · 1.53 KB
/
run-load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import sys
import time
import datetime
from pipeline.config import Config
from dotenv import load_dotenv
# Load settings via python-dotenv before LUX_BASEPATH is read from the
# environment (presumably from a local .env file -- confirm deployment setup).
load_dotenv()
basepath = os.getenv('LUX_BASEPATH', "")
cfgs = Config(basepath=basepath)
cfgs.cache_globals()
cfgs.instantiate_all()

### STOP. You probably want to use MLCP instead
# (MLCP presumably refers to MarkLogic Content Pump, the bulk loader.)

# Source of records to load, and the MarkLogic store they are written to.
ml = cfgs.results['marklogic']['recordcache']
store = cfgs.marklogic_stores['ml_sandbox']['store']
total = ml.len_estimate()

# Number of records sent to the store per update_multiple() call.
BATCH_SIZE = 200

# Usage: run-load.py [my_slice max_slice]
# Both arguments must be given together; with fewer than two arguments the
# script processes everything as a single slice (slice 0 of 1).
if len(sys.argv) > 2:
    try:
        my_slice = int(sys.argv[1])
        max_slice = int(sys.argv[2])
    except ValueError:
        sys.exit(
            f"usage: {sys.argv[0]} [my_slice max_slice] "
            "(both arguments must be integers)"
        )
    # A slice count below 1 would divide by zero (or yield a negative
    # workload) in the to_do computation below, so reject it up front with
    # a readable message instead of a traceback.
    if max_slice < 1:
        sys.exit(f"max_slice must be >= 1, got {max_slice}")
else:
    my_slice = 0
    max_slice = 1

# Expected number of records for this slice, derived from the cache's length
# estimate; used only for progress reporting.
to_do = int(total / max_slice)

# Progress bookkeeping consumed by the load loop.
done = 0        # records written so far
curr_amt = 0.0  # next progress fraction at which a status line is printed
prev_pct = 0    # time.time() of the previous status line (0 = none yet)
batch = []      # records accumulated for the next store update
# Copy every record of this slice from the record cache into the store in
# batches of BATCH_SIZE, printing a status line roughly every 0.5% of the
# estimated workload.

# Wall-clock start of the load; basis for throughput and the finish estimate.
start = time.time()

# Fraction of the workload between two consecutive status lines.
PROGRESS_STEP = 0.005
UTC = datetime.timezone.utc

for rec in ml.iter_records_slice(my_slice, max_slice):
    # Only the 'data' member of each cached record is written to the store.
    batch.append(rec['data'])
    if len(batch) < BATCH_SIZE:
        continue

    store.update_multiple(batch)
    done += len(batch)
    batch = []

    # to_do is derived from an estimate and may be 0 (or otherwise not
    # positive); treat that as "already complete" instead of dividing by it.
    fraction = done / to_do if to_do > 0 else 1.0
    if fraction <= curr_amt:
        continue

    ct = time.time()
    # Avoid a zero interval on clocks with coarse resolution.
    durn = max(ct - start, 1e-9)
    # Seconds since the previous status line (0 for the first one).
    diff = int(ct - prev_pct) if prev_pct else 0
    prev_pct = ct

    persec = done / durn
    total_secs = to_do / persec
    # Both timestamps are timezone-aware UTC so the log line never mixes
    # UTC with local time.
    now = datetime.datetime.now(UTC).isoformat()
    expected = datetime.datetime.fromtimestamp(start + total_secs, tz=UTC)
    print(
        f"[{now}] {done}/{to_do} = {fraction * 100:.1f}% last:{diff} "
        f"per sec: {persec} finish: {expected}"
    )
    sys.stdout.flush()

    # Move the threshold to the first step boundary beyond the current
    # progress. Computing it directly (rather than repeatedly adding the
    # step) avoids float drift and avoids lagging behind when one batch
    # covers more than one step.
    curr_amt = (int(fraction / PROGRESS_STEP) + 1) * PROGRESS_STEP

# Write whatever is left over from the last, partially filled batch.
if batch:
    store.update_multiple(batch)