-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdescribe.py
executable file
·158 lines (128 loc) · 4.16 KB
/
describe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
"""Compute some basic statistics on numerical data, read as one data
point (float) per line."""
#--- standard library imports
#
import sys
#--- third-party imports
#
import numpy
from scipy.stats import scoreatpercentile, describe
#--- project specific imports
#
# /
__author__ = "Andreas Wilm"
__version__ = "0.1"
__email__ = "[email protected]"
__license__ = "The MIT License (MIT)"
def adv_stats(arr):
    """Print an extended set of statistics for `arr` to stdout.

    The output consists of, in this order: the sum, the values reported
    by scipy.stats.describe (size, arithmetic mean, unbiased variance,
    biased skewness, biased kurtosis), the standard deviation as
    computed by numpy.std with its default arguments, the 1st, 5th,
    10th, 90th, 95th and 99th percentiles and finally a five number
    summary (min, q1, median, q3, max).

    Each value is printed on its own line as "<name>:<sep><value>" with
    the value formatted as "%f".

    Args:
        arr: numpy array holding the data points. It is passed as is
            to scipy.stats.describe and scipy.stats.scoreatpercentile.

    Returns:
        None. All results are written to stdout.
    """

    # single-argument print calls behave identically under Python 2
    # and Python 3
    print("sum:\t%f" % (arr.sum()))

    # scipy.stats.describe already covers a lot
    describe_names = ["size", "(min, max)", "arithmetic mean",
                      "unbiased variance", "biased skewness",
                      "biased kurtosis"]
    describe_result = dict(zip(describe_names, describe(arr)))
    for k in describe_names:
        v = describe_result[k]
        if k == "(min, max)":
            # not printed here, but as part of the five number
            # summary further down
            arr_min, arr_max = v[0], v[1]
            continue
        print("%s: %f" % (k, v))

    # ...but not percentiles
    median = scoreatpercentile(arr, 50)
    lower_q = scoreatpercentile(arr, 25)  # q1
    upper_q = scoreatpercentile(arr, 75)  # q3

    std = numpy.std(arr)
    print("std:\t%f" % (std))

    for p in [1, 5, 10, 90, 95, 99]:
        print("%dth percentile:\t%f" % (p, scoreatpercentile(arr, p)))

    print("# five number summary")
    # FIXME different to http://en.wikipedia.org/wiki/Five-number_summary
    # echo 0, 0, 1, 2, 63, 61, 27, 13, | tr ',' '\n' | describe.py
    # quartiles differ
    print("min:\t%f" % (arr_min))
    print("q1:\t%f" % (lower_q))
    print("median:\t%f" % (median))
    print("q3:\t%f" % (upper_q))
    print("max:\t%f" % (arr_max))
def basic_stats(arr):
    """Return a list of (name, value) tuples with basic statistics.

    The list holds, in this order: length, min, max, sum, mean and
    standard deviation ("stdv"), followed by the 1st, 5th, 25th ("q1"),
    50th ("median"), 75th ("q3"), 95th and 99th percentile as computed
    by numpy.percentile.

    Example:

        for (name, res) in basic_stats(arr):
            print("%s\t%f" % (name, res))

    NOTE: scipy.stats.describe offers much of this out of the box.

    Args:
        arr: one-dimensional numpy array (asserted).

    Returns:
        List of (name, value) tuples.
    """

    assert arr.ndim == 1

    # percentiles that go by a more common name
    percentile_labels = {25: "q1", 50: "median", 75: "q3"}

    stats = [
        ("length", len(arr)),
        ("min", numpy.min(arr)),
        ("max", numpy.max(arr)),
        ("sum", numpy.sum(arr)),
        ("mean", numpy.mean(arr)),
        ("stdv", numpy.std(arr)),
    ]

    for pct in (1, 5, 25, 50, 75, 95, 99):
        label = percentile_labels.get(pct, "%dth %%tile" % pct)
        stats.append((label, numpy.percentile(arr, pct)))

    return stats
def main():
    """Read data points and print their statistics.

    Usage: describe.py MODE [FILE]

    MODE is either 'basic' or 'advanced'. Data is read from FILE if
    exactly one extra argument is given and it is not "-"; otherwise
    data is read from stdin. The input is expected to hold one float
    per line; empty and whitespace-only lines are skipped.

    Exits with status 1 (after writing a message to stderr) if the
    mode argument is missing or invalid or if the input contains no
    data points.

    Raises:
        ValueError: if a non-empty input line cannot be converted to
            float.
    """

    valid_modes = ['basic', 'advanced']

    if len(sys.argv) < 2:
        sys.stderr.write("ERROR: Need at least mode argument (one of %s)\n" % (
            ', '.join(valid_modes)))
        sys.exit(1)

    mode = sys.argv[1]
    if mode not in valid_modes:
        sys.stderr.write("ERROR: First arg needs to be mode (one of %s)\n" % (
            ', '.join(valid_modes)))
        sys.exit(1)

    if len(sys.argv) == 3 and sys.argv[2] != "-":
        fh = open(sys.argv[2], 'r')
    else:
        fh = sys.stdin

    try:
        # using an iterable seems to be the most efficient way to
        # dynamically grow an array
        #
        iterable = (float(line) for line in fh if line.strip())
        # builtin float instead of the numpy.float alias, which was
        # removed from recent NumPy releases
        arr = numpy.fromiter(iterable, float)
    finally:
        # the array is fully materialised at this point, so the file
        # can be closed right away, even if parsing failed
        if fh is not sys.stdin:
            fh.close()

    if arr.size == 0:
        # the statistics functions fail on empty arrays
        sys.stderr.write("ERROR: No data points found in input\n")
        sys.exit(1)

    if mode == "basic":
        for (rname, rval) in basic_stats(arr):
            print("%s\t%s" % (rname, rval))
    elif mode == "advanced":
        adv_stats(arr)
    else:
        # defensive only: mode was validated above
        raise ValueError("Unknown mode %s" % mode)


if __name__ == "__main__":
    main()