-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathRefPhyloTest.drw
187 lines (166 loc) · 7.45 KB
/
RefPhyloTest.drw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
##
# A script to compare pairwise orthologs from reference gene trees
# with predicted orthologs for benchmarking
#
# Adrian Altenhoff, Sep 2006
# rewritten Adrian Altenhoff, Jul 13, 2007
# rewritten for BenchService Adrian Altenhoff, Dec 4, 2009
# rewritten for EBench Adrian Altenhoff, Jun 2019
#
# input arguments:
# refset_path path to reference dataset
# project_db path to predictions in darwin db format
# title name of the method which is evaluated
# testset type of trees, i.e. SwissTrees or TreeFam-A
# community_id community id
# assessment_fname filename where the assessment file should be written
# measure similarity measure, e.g 'avg Schlicker'
# out_dir directory where output is written to. must exist
#
printlevel := 3;
Set(printgc=false):
if not assigned(refset_path) then
error('refset_path not assigned');
fi:
if not assigned(community_id) or not assigned(assessment_fname) or not assigned(out_dir) then
error('community_id, assessment_fname and out_dir all must be defined');
fi:
#Assigns ReconciledTrees (a table)
ReadProgram(refset_path.'/ReconciledTrees_'.testset.'.drw');
cases := [op(Indices(ReconciledTrees))];
nrCases := length(cases);
lprint('nr of cases:', nrCases);
ComputePerformance := proc()
# false positive and true positive rates.
# - set to 0 to indicate no data
# - set to list [fpr, var(fpr)] once data is stored
fpr := CreateArray(1..nrCases, 0);
tpr := CreateArray(1..nrCases, 0);
DUPL := {':D=Y','D=Y','D','DUPLICATION'};
SPEC := {':D=N','D=N','S','SPECIATION'};
rawData := CreateArray(1..nrCases, []):
Logger('start computing RefPhyloTest Performance...', 'DEBUG');
for cNr to nrCases do
rels := ReconciledTrees[cases[cNr],'Relations']:
prts := ReconciledTrees[cases[cNr],'MappedProts'];
nProt := length(prts);
if nProt>5 then
dups := table({}); specs := table({});
for rel in Indices(rels) do
ev := rels[rel];
if member(ev,DUPL) then dups[rel[1]] := append(dups[rel[1]], rel[2]);
elif member(ev,SPEC) then specs[rel[1]] := append(specs[rel[1]], rel[2]);
else warning('ignoring relation: '.string([rel, ev])); fi:
od:
# iteration over the proteins in current reconciled tree
fp := tp := fn := tn := 1; #uniform prior probability
raw := []:
for eNr1 in prts do
vps := ParseLongList(SearchTag('VP',Entry(eNr1)));
vps := {op(vps)} intersect prts;
tpp := intersect(vps, specs[eNr1]):
tp := tp + length(tpp)/2;
raw := append(raw, seq([eNr1, If(z>eNr1, z, NULL), 'TP'], z=tpp) ):
tnp := minus(dups[eNr1], vps):
tn := tn + length(tnp)/2;
raw := append(raw, seq([eNr1, If(z>eNr1, z, NULL), 'TN'], z=tnp) ):
fpp := intersect(vps, dups[eNr1]);
fp := fp + length(fpp)/2;
raw := append(raw, seq([eNr1, If(z>eNr1, z, NULL), 'FP'], z=fpp) ):
fnp := minus(specs[eNr1], vps):
fn := fn + length(fnp)/2;
raw := append(raw, seq([eNr1, If(z>eNr1, z, NULL), 'FN'], z=fnp) ):
od:
# compute the ppv and tpr and their variances
p := tp/(tp+fp); # positive predictive value
fpr[cNr] := [p, p*(1-p)/(tp+fp)];
p := tp/(tp+fn); # true positive rate
tpr[cNr] := [p, p*(1-p)/(tp+fn)];
rawData[cNr] := raw;
Logger(sprintf('[RefPhyloTest]: finished evaluating case "%s": (%d proteins)', cases[cNr], nProt), 'INFO');
else
Logger(sprintf('[RefPhyloTest]: too few (%d) proteins in %a\n', nProt, cases[cNr]), 'INFO');
fi:
od:
return( [fpr, tpr, rawData] );
end:
StoreRawData := proc(rawData, name, fname_)
fname := fname_;
if length(fname) > 4 and fname[-3..-1] = '.gz' then
fname := fname[1..-4];
do_gzip := true;
else do_gzip := false fi:
OpenWriting(fname);
printf('# Dataset<tab>Protein ID 1<tab>Protein ID 2<tab>Correctness (TP:True '.
'positive, FP: False positive, TN: True Negative. FN: False Negative)\n');
for i to nrCases do
pNam := cases[i];
for z in rawData[i] do
id1 := ENr2XRef(z[1]);
id2 := ENr2XRef(z[2]);
printf('%s\t%s\t%s\t%s\n', pNam, id1, id2, z[3]);
od:
od:
OpenWriting(previous);
if do_gzip then CallSystem('gzip -9f '.fname); fi
end:
StoreResult := proc(fn:string, data)
OpenWriting(fn): prints(json(data)): OpenWriting(previous);
end:
DB := projDB := ReadDb(project_db);
title_id := ReplaceString(' ','-', ReplaceString('_', '-', title));
raw_out_fn := sprintf('RefPhylo-%s_%s_%s_raw.txt.gz', testset, title_id, sha2(string([testset, title, project_db]))[1..12]);
result := table():
result['testset'] := testset:
result['raw_data_fn'] := raw_out_fn:
perf := ComputePerformance():
PPV := perf[1]: TPR := perf[2]: raw_data := perf[3]:
print(TPR);
fams := []:
assessments := []:
if nrCases > 1 then
for cNr to nrCases do
per_fam := table():
per_fam['case'] := cases[cNr];
recall := table(): precision := table():
print(TPR[cNr], TPR[cNr,1], TPR[cNr,2], PPV[cNr]);
if TPR[cNr]<>0 then
recall['name'] := 'true positive rate';
recall['value'] := TPR[cNr, 1];
recall['stderr'] := 1.96 * sqrt(TPR[cNr, 2]);
precision['name'] := 'positive predictive value rate';
precision['value'] := PPV[cNr, 1];
precision['stderr'] := 1.96 * sqrt(PPV[cNr, 2]);
fi:
per_fam['recall_measures'] := [recall];
per_fam['precision_measures'] := [precision];
fams := append(fams, per_fam);
challenge := testset.'-'.cases[cNr];
assessments := append(assessments,
AssessmentDataset(community_id, challenge, title, 'TPR', TPR[cNr, 1], 1.96 * sqrt(TPR[cNr, 2])),
AssessmentDataset(community_id, challenge, title, 'PPV', PPV[cNr, 1], 1.96 * sqrt(PPV[cNr, 2])));
od:
result['per_case_results'] := fams:
fi;
recall := table(): precision := table():
recall['name'] := 'true positive rate';
precision['name'] := 'positive predictive value rate';
nr_valid_cases := sum(If(TPR[c]<>0, 1, 0), c=1..nrCases);
if nr_valid_cases > 0 then
precision['value'] := avg(seq(If(PPV[c] <> 0, PPV[c,1], NULL), c=1..nrCases));
precision['stderr'] := 1.96*sqrt( sum([seq(If(PPV[c] <> 0, PPV[c,2], NULL), c=1..nrCases)]) ) / nr_valid_cases;
recall['value'] := avg(seq(If(TPR[c] <> 0, TPR[c,1], NULL), c=1..nrCases));
recall['stderr'] := 1.96*sqrt( sum([seq(If(TPR[c] <> 0, TPR[c,2], NULL), c=1..nrCases)]) ) / nr_valid_cases;
else
recall['value'] := precision['value'] := 0.5;
recall['stderr'] := precision['stderr'] := 1;
fi:
result['recall_measures'] := [recall];
result['precision_measures'] := [precision];
assessments := append(assessments,
AssessmentDataset(community_id, testset, title, 'TPR', recall['value'], recall['stderr']),
AssessmentDataset(community_id, testset, title, 'PPV', precision['value'], precision['stderr']));
StoreRawData(raw_data, title, out_dir.'/'.result['raw_data_fn']);
#StoreResult(out_dir.'/result.json', result);
StoreResult(assessment_fname, assessments);
done;