-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbdc.py
47 lines (38 loc) · 1.34 KB
/
bdc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import logging
import math
import multiprocessing as mp
import numpy as np
import pandas as pd
def xlogx(x):
    """Return ``x * log2(x)``, using the limit value 0 at ``x == 0``.

    Args:
        x: A non-negative number.

    Returns:
        ``x * log2(x)`` as a float; ``0.0`` when ``x`` is zero.

    Raises:
        ValueError: If ``x`` is negative (raised by ``math.log2``).
    """
    if x == 0:
        # log2(0) is undefined, but x*log2(x) tends to 0 as x -> 0+.
        # Return a float so both branches yield the same kind of value.
        return 0.0
    return x * math.log2(x)
def categories(dat):
    """Return the distinct labels found in ``dat.index``, in ascending order."""
    distinct_labels = list(set(dat.index))
    distinct_labels.sort()
    return distinct_labels
def calc_bdc_parallel(state):
    """Compute the BDC score of every term in one chunk (pool worker).

    For each term ``t`` the per-category relative frequencies
    ``p(t|c) = f_t_ci[c][t] / f_ci[c]`` are normalised so they sum to one
    over the categories, the entropy ``BH(t)`` of that distribution is
    taken, and the score is ``1 - BH(t) / log2(len(cs))``.

    Args:
        state: A four-item sequence ``[terms_to_process, f_t_ci, f_ci, cs]``
            packed into a single argument:

            * ``terms_to_process`` -- the terms scored by this call;
            * ``f_t_ci`` -- mapping category -> per-term frequencies,
              indexable by term;
            * ``f_ci`` -- mapping category -> total frequency of the category;
            * ``cs`` -- the categories.

    Returns:
        A ``pandas.DataFrame`` indexed by ``terms_to_process`` with a single
        column ``'bdc'``. The score is NaN where it is undefined: when the
        term has zero frequency in every category, or when there are fewer
        than two categories.
    """
    terms_to_process, f_t_ci, f_ci, cs = state
    # The entropy normaliser does not depend on the term, so compute it once.
    # With fewer than two categories it is zero (or undefined) and no score
    # can be formed.
    n_categories = len(cs)
    max_entropy = math.log2(n_categories) if n_categories > 1 else 0.0
    # Note:
    # `out` was first initialised as a pd.DataFrame and filled with
    # `out.loc[t, 'bdc'] = bdc`, which was extremely slow because every
    # assignment performs a label lookup for `[t, 'bdc']`. Collecting the
    # scores in a list and building the frame once avoids that.
    out = []
    for t in terms_to_process:
        p_t_ci = {c: f_t_ci[c][t] / f_ci[c] for c in cs}
        # Sum once per term rather than once per category.
        p_total = sum(p_t_ci.values())
        if max_entropy == 0 or p_total == 0:
            # 0/0 below would raise or silently yield NaN with a warning;
            # make the "undefined" result explicit instead.
            out.append(float('nan'))
            continue
        G_t_ci = {c: p_t_ci[c] / p_total for c in cs}
        BH_t = -sum(xlogx(G_t_ci[c]) for c in cs)
        out.append(1 - BH_t / max_entropy)
    return pd.DataFrame(out, index=terms_to_process, columns=['bdc'])
def calc_bdc(dat, PARALLEL=4):
    """Compute the BDC score of every column (term) of ``dat`` in parallel.

    Rows of ``dat`` are grouped by their index label (the category), the
    columns are split into chunks, and each chunk is scored by
    ``calc_bdc_parallel`` in a ``multiprocessing`` pool.

    Args:
        dat: A ``pandas.DataFrame`` whose index labels are categories and
            whose columns are terms; values are presumably term
            frequencies/counts (verify against the caller).
        PARALLEL: Maximum number of worker processes. Fewer are started
            when ``dat`` has fewer columns than this.

    Returns:
        A ``pandas.DataFrame`` with one row per column of ``dat`` (in the
        original column order) and a single column ``'bdc'``.

    Raises:
        ValueError: If ``PARALLEL`` is smaller than 1.
    """
    if PARALLEL < 1:
        raise ValueError('PARALLEL must be a positive integer')
    cs = categories(dat)
    # Select with a list (`[[c]]`) so the result is always a DataFrame.
    # `dat.loc[c]` returns a Series when the category has exactly one row,
    # and `.sum()` would then collapse every term into a single scalar.
    f_t_ci = {c: dat.loc[[c]].sum() for c in cs}
    f_ci = {c: f_t_ci[c].sum() for c in cs}
    # Never start more workers than there are columns to distribute.
    n_workers = max(1, min(PARALLEL, len(dat.columns)))
    states = [[terms_to_process, f_t_ci, f_ci, cs]
              for terms_to_process in np.array_split(dat.columns,
                                                     n_workers)]
    # The context manager releases the pool even if a worker raises.
    with mp.Pool(processes=n_workers) as po:
        res = po.map(calc_bdc_parallel, states)
    return pd.concat(res)