-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.js
185 lines (140 loc) · 5.65 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
'use strict';
const fs = require('fs');
const path = require('path');
const cheerio = require('cheerio');
const progress = require('progress');
const { performance } = require('perf_hooks');
// Coded by @mguidoti
const MG = require('./attr');
// Directory holding the treatment XML dump. Every entry returned by
// readdirSync is processed by the main loop below.
// NOTE(review): readdirSync returns all entry names (including hidden files
// or sub-directories, if any exist) -- confirm the dump contains only XML.
const xmldir = '/Users/punkish/Projects/zenodeo/data/treatmentsDump';
const xmlfiles = fs.readdirSync(xmldir);

// Accumulators filled by the main loop and read by the report writers:
//   uniqTags             tag name                   -> occurrence count
//   uniqTagsAttribs      'tag\tattr1,attr2,...'     -> occurrence count
//   uniqTagsAttribsPairs 'tag,attribute'            -> [count, treatmentID]
//   treatmentIDs         any of the keys above      -> treatmentID last seen
//   resultingHash        filled by MG.attr (see ./attr)
const uniqTags = {};
const uniqTagsAttribs = {};
const uniqTagsAttribsPairs = {};
const treatmentIDs = {};
const resultingHash = {};

// start of the wall-clock measurement reported at the end of the script
const t0 = performance.now();

// loop index and total number of files for the main loop
let i = 0;
const j = xmlfiles.length;

// update the progress bar every 10% of the total num of files. The interval
// is clamped to at least 1 so that small dumps (fewer than 10 files) still
// advance the bar and the modulo test in the loop never sees 0.
const tickInterval = Math.max(1, Math.floor(j * 0.10));

const bar = new progress(' processing [:bar] :rate files/sec :percent :etas', {
    complete: '=',
    incomplete: ' ',
    width: 30,
    total: j
});
/**
 * Adds one to `table[key]`, starting at 1 for a key that has not been stored
 * yet. An own-property check is used instead of a truthiness test so that a
 * key which happens to match an Object.prototype member (for example a tag
 * named `constructor`) is still counted numerically.
 */
const incrementCount = (table, key) => {
    const seen = Object.prototype.hasOwnProperty.call(table, key);
    table[key] = seen ? table[key] + 1 : 1;
};

// Process every file of the dump: parse it as XML and tally, for each element,
// its tag, its tag + sorted attribute list, and each tag-attribute pair.
for (; i < j; i++) {
    const fileName = xmlfiles[i];

    // the treatmentID is the file name up to the first '.'
    const treatmentID = fileName.split('.')[0];

    const $ = cheerio.load(fs.readFileSync(path.join(xmldir, fileName), 'utf8'), {
        normalizeWhitespace: true,
        xmlMode: true
    });

    // All element nodes that are children of some element.
    // NOTE(review): because this goes through .contents(), the document's
    // root element itself does not appear to be included -- confirm that is
    // intended.
    const elements = $('*')
        .contents()
        .filter((idx, node) => node.type === 'tag')
        .get();

    for (const e of elements) {
        const tag = e.name;
        const attribs = Object.keys(e.attribs).sort();
        const hasAttribs = attribs.length > 0;

        // if there are no attributes, we default the output to
        // 'no attributes'
        const attributesList = hasAttribs ? attribs.join(',') : 'no attributes';
        const tagAttribs = tag + '\t' + attributesList;

        // one 'tag,attribute' key per attribute (or a single
        // 'tag,no attributes' key); the report writer splits these on ','
        const pairKeys = (hasAttribs ? attribs : ['no attributes'])
            .map(attrib => `${tag},${attrib}`);
        const tagAttribsPairsStr = pairKeys.join('*');

        // get variance on attributes
        MG.attr(resultingHash, e);

        // Store the name of the file (really the treatmentID) keyed
        // by the unique tags or tagAttribs or tagAttribsPairsStr. Later
        // files overwrite earlier ones, so the last file seen wins.
        treatmentIDs[tag] = treatmentID;
        treatmentIDs[tagAttribs] = treatmentID;
        treatmentIDs[tagAttribsPairsStr] = treatmentID;

        // calculate frequency of occurrence
        incrementCount(uniqTags, tag);
        incrementCount(uniqTagsAttribs, tagAttribs);

        // one tag can generate many pairs, so each pair keeps its own
        // [frequency, treatmentID] entry
        pairKeys.forEach(pairKey => {
            const previous = uniqTagsAttribsPairs[pairKey];
            uniqTagsAttribsPairs[pairKey] = [previous ? previous[0] + 1 : 1, treatmentID];
        });
    }

    // Advance the progress bar only for files that are actually done: one
    // step per full tickInterval, plus the remainder after the last file, so
    // the bar reaches exactly `j` when the loop ends.
    const processed = i + 1;
    if (processed % tickInterval === 0) {
        bar.tick(tickInterval);
    }
    else if (processed === j) {
        bar.tick(processed % tickInterval);
    }
}
// Report: one tab-separated row per tag name, in sorted order, giving the
// treatmentID stored for that tag and the number of times the tag occurred.
let headers = ['tag', 'xml', 'frequency'];
let rep = `${headers.join('\t')}\n`;
for (const tagName of Object.keys(uniqTags).sort()) {
    const cells = [tagName, treatmentIDs[tagName], uniqTags[tagName]];
    rep = `${rep}${cells.join('\t')}\n`;
}
fs.writeFileSync('reports/tags.tsv', rep, 'utf8');
// Report: one row per distinct "tag + attribute list" combination, sorted.
// The key already contains a tab between the tag and its attribute list, so
// it fills the first two columns by itself.
headers = ['tag', 'attributes', 'xml', 'frequency'];
rep = Object.keys(uniqTagsAttribs)
    .sort()
    .reduce(
        (text, key) => text + [key, treatmentIDs[key], uniqTagsAttribs[key]].join('\t') + '\n',
        headers.join('\t') + '\n'
    );
fs.writeFileSync('reports/tags-attributes.tsv', rep, 'utf8');
// alphabetically sorted list of individual tag-attribute pairs
// Each key of uniqTagsAttribsPairs is 'tag,attribute' and each value is
// [frequency, treatmentID]. The header lists the columns in the order the
// rows are written (frequency before xml); previously the last two header
// labels were swapped relative to the data.
headers = ['tag', 'attributes', 'frequency', 'xml'];
rep = headers.join('\t') + '\n';
Object.keys(uniqTagsAttribsPairs)
    .sort()
    .forEach(pairKey => {
        const [tag, attribute] = pairKey.split(',');
        const [freq, xml] = uniqTagsAttribsPairs[pairKey];
        const row = [tag, attribute, freq, xml];
        rep += row.join('\t') + '\n';
    });
fs.writeFileSync('reports/tagAttribute-pairs.tsv', rep, 'utf8');
// Report: the contents of resultingHash (populated by MG.attr), one
// key/value row per entry, keys in sorted order.
headers = ['tag with attr="type"', 'frequency'];
rep = [
    headers,
    ...Object.keys(resultingHash)
        .sort()
        .map(key => [key, resultingHash[key]])
]
    .map(cells => cells.join('\t') + '\n')
    .join('');
fs.writeFileSync('reports/variance-on-attr-type.tsv', rep, 'utf8');
// Print the total run time, measured from t0, with two decimals.
const t1 = performance.now();
const elapsedMs = (t1 - t0).toFixed(2);
console.log(`extracted unique tags from ${j} files in ${elapsedMs} ms`);