forked from JelleAalbers/multihist
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultihist.py
793 lines (658 loc) · 31.5 KB
/
multihist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
from __future__ import division
from copy import deepcopy
from functools import reduce
try:
from itertools import izip as zip
except ImportError:
# Hello, python 3!
pass
import numpy as np
try:
from scipy.ndimage import zoom
HAVE_SCIPY = True
except ImportError:
HAVE_SCIPY = False
try:
import matplotlib
import matplotlib.pyplot as plt
CAN_PLOT = True
except ImportError:
plt = None
CAN_PLOT = False
COLUMNAR_DATA_SOURCES = []
try:
import dask
import dask.dataframe
import dask.multiprocessing
WE_HAVE_DASK = True
DEFAULT_DASK_COMPUTE_KWARGS = dict(get=dask.multiprocessing.get)
COLUMNAR_DATA_SOURCES.append(dask.dataframe.DataFrame)
except Exception: # Sometimes dask import succeeds, but throws error when starting up
WE_HAVE_DASK = False
pass
try:
import pandas as pd
COLUMNAR_DATA_SOURCES.append(pd.DataFrame)
except ImportError:
pass
COLUMNAR_DATA_SOURCES = tuple(COLUMNAR_DATA_SOURCES)
from operator import itemgetter
__version__ = '0.6.0'
class CoordinateOutOfRangeException(Exception):
pass
class MultiHistBase(object):
def similar_blank_hist(self):
newhist = deepcopy(self)
newhist.histogram = np.zeros_like(self.histogram)
return newhist
@property
def n(self):
"""Returns number of data points loaded into histogram"""
return np.sum(self.histogram)
# Overload binary numeric operators to work on histogram
# TODO: logical operators
def __getitem__(self, item):
return self.histogram[item]
def __setitem__(self, key, value):
self.histogram[key] = value
# Let unary operators work on wrapped histogram:
def min(self):
return self.histogram.min()
def max(self):
return self.histogram.max()
def __len__(self):
return len(self.histogram)
def __neg__(self):
return self.__class__.from_histogram(-self.histogram, self.bin_edges, self.axis_names)
def __pos__(self):
return self.__class__.from_histogram(+self.histogram, self.bin_edges, self.axis_names)
def __abs__(self):
return self.__class__.from_histogram(abs(self.histogram), self.bin_edges, self.axis_names)
def __invert__(self):
return self.__class__.from_histogram(~self.histogram, self.bin_edges, self.axis_names)
# Let binary operators work on wrapped histogram
@classmethod
def _make_binop(cls, opname):
def binop(self, other):
return self.__class__.from_histogram(
getattr(self.histogram, opname)(other),
self.bin_edges,
self.axis_names)
return binop
for methodname in 'add sub mul div truediv floordiv mod divmod pow lshift rshift and or'.split():
dundername = '__%s__' % methodname
setattr(MultiHistBase,
dundername,
MultiHistBase._make_binop(dundername))
setattr(MultiHistBase,
'__r%s__' % methodname,
getattr(MultiHistBase, dundername))
# Verbose alias
MultiHistBase.similar_blank_histogram = MultiHistBase.similar_blank_hist
class Hist1d(MultiHistBase):
axis_names = None
dimensions = 1
@classmethod
def from_histogram(cls, histogram, bin_edges, axis_names=None):
"""Make a Hist1D from a numpy bin_edges + histogram pair
:param histogram: Initial histogram
:param bin_edges: Bin edges of histogram. Must be one longer than length of histogram
:param axis_names: Ignored. Sorry :-)
:return:
"""
if len(bin_edges) != len(histogram) + 1:
raise ValueError("Bin edges must be of length %d, you gave %d!" % (len(histogram) + 1, len(bin_edges)))
self = cls(bins=bin_edges)
self.histogram = np.array(histogram)
return self
def __init__(self, data=None, bins=10, range=None, weights=None):
"""
:param data: Initial data to histogram.
:param bins: Number of bins, or list of bin edges (like np.histogram)
:param weights: Weights for initial data.
:param range: Range of histogram.
:return: None
"""
if data is None:
data = []
self.histogram, self.bin_edges = np.histogram(data, bins=bins, range=range, weights=weights)
def add(self, data, weights=None):
hist, _ = np.histogram(data, self.bin_edges, weights=weights)
self.histogram += hist
@property
def bin_centers(self):
return 0.5 * (self.bin_edges[1:] + self.bin_edges[:-1])
def bin_volumes(self):
return np.diff(self.bin_edges)
@property
def density(self):
"""Gives emprical PDF, like np.histogram(...., density=True)"""
h = self.histogram.astype(np.float)
bindifs = np.array(np.diff(self.bin_edges), float)
return h / (bindifs * self.n)
@property
def normalized_histogram(self):
"""Gives histogram with sum of entries normalized to 1."""
return self.histogram / self.n
@property
def cumulative_histogram(self):
return np.cumsum(self.histogram)
@property
def cumulative_density(self):
cs = np.cumsum(self.histogram)
return cs / cs[-1]
def get_random(self, size=10):
"""Returns random variates from the histogram.
Note this assumes the histogram is an 'events per bin', not a pdf.
Inside the bins, a uniform distribution is assumed.
"""
bin_i = np.random.choice(np.arange(len(self.bin_centers)), size=size, p=self.normalized_histogram)
return self.bin_centers[bin_i] + np.random.uniform(-0.5, 0.5, size=size) * self.bin_volumes()[bin_i]
def items(self):
"""Iterate over (bin_center, hist_value) from left to right"""
return zip(self.bin_centers, self.histogram)
@property
def mean(self):
"""Estimates mean of underlying data, assuming each datapoint was exactly in the center of its bin."""
return np.average(self.bin_centers, weights=self.histogram)
@property
def std(self, bessel_correction=True):
"""Estimates std of underlying data, assuming each datapoint was exactly in the center of its bin."""
if bessel_correction:
n = self.n
bc = n / (n - 1)
else:
bc = 1
return np.sqrt(np.average((self.bin_centers - self.mean) ** 2, weights=self.histogram)) * bc
def plot(self, normed=False, scale_errors_by=1.0, scale_histogram_by=1.0, plt=plt, errors=False, **kwargs):
"""Plots the histogram with Poisson (sqrt(n)) error bars
- scale_errors_by multiplies the error bars by its argument
- scale_histogram_by multiplies the histogram AND the error bars by its argument
- plt thing to call .errorbar on (pylab, figure, axes, whatever the matplotlib guys come up with next)
"""
if not CAN_PLOT:
raise ValueError("matplotlib did not import, so can't plot your histogram...")
if errors:
kwargs.setdefault('linestyle', 'none')
yerr = np.sqrt(self.histogram)
if normed:
y = self.normalized_histogram
yerr /= self.n
else:
y = self.histogram.astype(np.float)
yerr *= scale_errors_by * scale_histogram_by
y *= scale_histogram_by
plt.errorbar(self.bin_centers, y, yerr,
marker='.', **kwargs)
else:
# Note steps-pre: plotting vs centers and using
# steps-mid is problematic:
# * the steps won't be correct for log scales
# * the final bins will not fully show
kwargs.setdefault('linestyle', 'steps-pre')
x = self.bin_edges
y = self.lookup(x)
plt.plot(x, y, **kwargs)
def percentile(self, percentile):
"""Return bin center nearest to percentile"""
return self.bin_centers[np.argmin(np.abs(self.cumulative_density * 100 - percentile))]
def lookup(self, coordinates):
"""Lookup values at coordinates.
coordinates: arraylike of coordinates.
Clips if out of range!! TODO: option to throw exception instead.
TODO: Needs tests!!
"""
# Convert coordinates to indices
index_array = np.clip(np.searchsorted(self.bin_edges, coordinates) - 1,
0,
len(self.bin_edges) - 2)
# Use the index array to slice the histogram
return self.histogram[index_array]
class Histdd(MultiHistBase):
"""multidimensional histogram object
"""
axis_names = None
@classmethod
def from_histogram(cls, histogram, bin_edges, axis_names=None):
"""Make a HistdD from numpy histogram + bin edges
:param histogram: Initial histogram
:param bin_edges: x bin edges of histogram, y bin edges, ...
:return: Histnd instance
"""
bin_edges = np.array(bin_edges)
self = cls(bins=bin_edges, axis_names=axis_names)
self.histogram = histogram
return self
def __init__(self, *data, **kwargs):
for k, v in {'bins': 10, 'range': None, 'weights': None, 'axis_names': None}.items():
kwargs.setdefault(k, v)
# dimensions is a shorthand [(axis_name_1, bins_1), (axis_name_2, bins_2), ...]
if 'dimensions' in kwargs:
kwargs['axis_names'], kwargs['bins'] = zip(*kwargs['dimensions'])
if len(data) == 0:
if kwargs['range'] is None:
if kwargs['bins'] is None:
raise ValueError("Must specify data, bins, or range")
try:
dimensions = len(kwargs['bins'])
except TypeError:
raise ValueError("If you specify no data and no ranges, must specify a bin specification "
"which tells me what dimension you want. E.g. [10, 10, 10] instead of 10.")
else:
dimensions = len(kwargs['range'])
data = np.zeros((0, dimensions)).T
self.axis_names = kwargs['axis_names']
self.histogram, self.bin_edges = self._data_to_hist(data, **kwargs)
def add(self, *data, **kwargs):
self.histogram += self._data_to_hist(data, **kwargs)[0]
@staticmethod
def _is_columnar(x):
if isinstance(x, COLUMNAR_DATA_SOURCES):
return True
if isinstance(x, np.ndarray) and x.dtype.fields:
return True
return False
def _data_to_hist(self, data, **kwargs):
"""Return bin_edges, histogram array"""
if hasattr(self, 'bin_edges'):
kwargs.setdefault('bins', self.bin_edges)
if len(data) == 1 and self._is_columnar(data[0]):
data = data[0]
if self.axis_names is None:
raise ValueError("When histogramming from a columnar data source, "
"axis_names or dimensions is mandatory")
is_dask = False
if WE_HAVE_DASK:
is_dask = isinstance(data, dask.dataframe.DataFrame)
if is_dask:
fake_histogram = Histdd(axis_names=self.axis_names, bins=kwargs['bins'])
partial_hists = []
for partition in data.to_delayed():
ph = dask.delayed(Histdd)(partition, axis_names=self.axis_names, bins=kwargs['bins'])
ph = dask.delayed(lambda x: x.histogram)(ph)
ph = dask.array.from_delayed(ph,
shape=fake_histogram.histogram.shape,
dtype=fake_histogram.histogram.dtype)
partial_hists.append(ph)
partial_hists = dask.array.stack(partial_hists, axis=0)
compute_options = kwargs.get('compute_options', {})
for k, v in DEFAULT_DASK_COMPUTE_KWARGS.items():
compute_options.setdefault(k, v)
histogram = partial_hists.sum(axis=0).compute(**compute_options)
bin_edges = fake_histogram.bin_edges
return histogram, bin_edges
else:
data = np.vstack([data[x].values if isinstance(data, pd.DataFrame) else data[x]
for x in self.axis_names])
data = np.array(data).T
return np.histogramdd(data,
bins=kwargs.get('bins'),
weights=kwargs.get('weights'),
range=kwargs.get('range'))
@property
def dimensions(self):
return len(self.bin_edges)
##
# Axis selection
##
def get_axis_number(self, axis):
if isinstance(axis, int):
return axis
if isinstance(axis, str):
if self.axis_names is None:
raise ValueError("Axis name %s not in histogram: histogram has no named axes." % axis)
if axis in self.axis_names:
return self.axis_names.index(axis)
raise ValueError("Axis name %s not in histogram. Axis names which are: %s" % (axis, self.axis_names))
raise ValueError("Argument to get_axis_number should be string or integer, but you gave %s" % axis)
def other_axes(self, axis):
axis = self.get_axis_number(axis)
return tuple([i for i in range(self.dimensions) if i != axis])
def axis_names_without(self, axis):
"""Return axis names without axis, or None if axis_names is None"""
if self.axis_names is None:
return None
return itemgetter(*self.other_axes(axis))(self.axis_names)
##
# Bin wrangling: centers <-> edges, values <-> indices
##
def bin_centers(self, axis=None):
"""Return bin centers along an axis, or if axis=None, list of bin_centers along each axis"""
if axis is None:
return np.array([self.bin_centers(axis=i) for i in range(self.dimensions)])
axis = self.get_axis_number(axis)
return 0.5 * (self.bin_edges[axis][1:] + self.bin_edges[axis][:-1])
def get_axis_bin_index(self, value, axis):
"""Returns index along axis of bin in histogram which contains value
Inclusive on both endpoints
"""
axis = self.get_axis_number(axis)
bin_edges = self.bin_edges[axis]
# The right bin edge of np.histogram is inclusive:
if value == bin_edges[-1]:
# Minus two: one for bin edges rather than centers, one for 0-based indexing
return len(bin_edges) - 2
# For all other bins, it is exclusive.
result = np.searchsorted(bin_edges, [value], side='right')[0] - 1
if not 0 <= result <= len(bin_edges) - 1:
raise CoordinateOutOfRangeException("Value %s is not in range (%s-%s) of axis %s" % (
value, bin_edges[0], bin_edges[-1], axis))
return result
def get_bin_indices(self, values):
"""Returns index tuple in histogram of bin which contains value"""
return tuple([self.get_axis_bin_index(values[ax_i], ax_i)
for ax_i in range(self.dimensions)])
def all_axis_bin_centers(self, axis):
"""Return ndarray of same shape as histogram containing bin center value along axis at each point"""
# Arcane hack that seems to work, at least in 3d... hope
axis = self.get_axis_number(axis)
return np.meshgrid(*self.bin_centers(), indexing='ij')[axis]
##
# Data reduction: sum, slice, project, ...
##
def sum(self, axis):
"""Sums all data along axis, returns d-1 dimensional histogram"""
axis = self.get_axis_number(axis)
if self.dimensions == 2:
new_hist = Hist1d
else:
new_hist = Histdd
return new_hist.from_histogram(np.sum(self.histogram, axis=axis),
bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
axis_names=self.axis_names_without(axis))
def slice(self, start, stop=None, axis=0):
"""Restrict histogram to bins whose data values (not bin numbers) along axis are between start and stop
(both inclusive). Returns d dimensional histogram."""
if stop is None:
# Make a 1=bin slice
stop = start
axis = self.get_axis_number(axis)
start_bin = max(0, self.get_axis_bin_index(start, axis))
stop_bin = min(len(self.bin_centers(axis)) - 1, # TODO: test off by one!
self.get_axis_bin_index(stop, axis))
new_bin_edges = self.bin_edges.copy()
new_bin_edges[axis] = new_bin_edges[axis][start_bin:stop_bin + 2] # TODO: Test off by one here!
return Histdd.from_histogram(np.take(self.histogram, np.arange(start_bin, stop_bin + 1), axis=axis),
bin_edges=new_bin_edges, axis_names=self.axis_names)
def slicesum(self, start, stop=None, axis=0):
"""Slices the histogram along axis, then sums over that slice, returning a d-1 dimensional histogram"""
return self.slice(start, stop, axis).sum(axis)
def projection(self, axis):
"""Sums all data along all other axes, then return Hist1D"""
axis = self.get_axis_number(axis)
projected_hist = np.sum(self.histogram, axis=self.other_axes(axis))
return Hist1d.from_histogram(projected_hist, bin_edges=self.bin_edges[axis])
##
# Density methods: cumulate, normalize, ...
##
def cumulate(self, axis):
"""Returns new histogram with all data cumulated along axis."""
axis = self.get_axis_number(axis)
return Histdd.from_histogram(np.cumsum(self.histogram, axis=axis),
bin_edges=self.bin_edges,
axis_names=self.axis_names)
def _simsalabim_slice(self, axis):
return [slice(None) if i != axis else np.newaxis
for i in range(self.dimensions)]
def normalize(self, axis):
"""Returns new histogram where all values along axis (in one bin of the other axes) sum to 1"""
axis = self.get_axis_number(axis)
sum_along_axis = np.sum(self.histogram, axis=axis)
# Don't do anything for subspaces without any entries -- this avoids nans everywhere
sum_along_axis[sum_along_axis == 0] = 1
hist = self.histogram / sum_along_axis[tuple(self._simsalabim_slice(axis))]
return Histdd.from_histogram(hist,
bin_edges=self.bin_edges,
axis_names=self.axis_names)
def cumulative_density(self, axis):
"""Returns new histogram with all values replaced by their cumulative densities along axis."""
return self.normalize(axis).cumulate(axis)
def central_likelihood(self, axis):
"""Returns new histogram with all values replaced by their central likelihoods along axis."""
result = self.cumulative_density(axis)
result.histogram = 1 - 2 * np.abs(result.histogram - 0.5)
return result
##
# Mixed methods: both reduce and summarize the data
##
def percentile(self, percentile, axis, inclusive=True):
"""Returns d-1 dimensional histogram containing percentile of values along axis
if inclusive=True, will report bin center of first bin for which percentile% of data lies in or below the bin
=False, ... data lies strictly below the bin
10% percentile is calculated as: value at least 10% data is LOWER than
"""
axis = self.get_axis_number(axis)
# Shape of histogram
s = self.histogram.shape
# Shape of histogram after axis has been collapsed to 1
s_collapsed = list(s)
s_collapsed[axis] = 1
# Shape of histogram with axis removed entirely
s_removed = np.concatenate([s[:axis], s[axis + 1:]]).astype(np.int)
# Using np.where here is too tricky, as it may not return a value for each "bin-columns"
# First, get an array which has a minimum at the percentile-containing bins
# The minimum may not be unique: if later bins are empty, they will not be
if inclusive:
ecdf = self.cumulative_density(axis).histogram
else:
density = self.normalize(axis).histogram
ecdf = ecdf - density
ecdf = np.nan_to_num(ecdf) # Since we're relying on self-equality later
x = ecdf - 2 * (ecdf >= percentile / 100)
# We now want to get the location of the minimum
# To ensure it is unique, add a very very very small monotonously increasing bit to x
# Nobody will want 1e-9th percentiles, right? TODO
sz = np.ones(len(s), dtype=np.int)
sz[axis] = -1
x += np.linspace(0, 1e-9, s[axis]).reshape(sz)
# 1. Find the minimum along the axis
# 2. Reshape to s_collapsed and perform == to get a mask
# 3. Apply the mask to the bin centers along axis
# 4. Unflatten with reshape
result = self.all_axis_bin_centers(axis)[
x == np.min(x, axis=axis).reshape(s_collapsed)
]
result = result.reshape(s_removed)
if self.dimensions == 2:
new_hist = Hist1d
else:
new_hist = Histdd
return new_hist.from_histogram(histogram=result,
bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
axis_names=self.axis_names_without(axis))
def average(self, axis):
"""Returns d-1 dimensional histogram of (estimated) mean value of axis
NB this is very different from averaging over the axis!!!
"""
axis = self.get_axis_number(axis)
avg_hist = np.ma.average(self.all_axis_bin_centers(axis),
weights=self.histogram, axis=axis)
if self.dimensions == 2:
new_hist = Hist1d
else:
new_hist = Histdd
return new_hist.from_histogram(histogram=avg_hist,
bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
axis_names=self.axis_names_without(axis))
def std(self, axis):
"""Returns d-1 dimensional histogram of (estimated) std value along axis
NB this is very different from just std of the histogram values (which describe bin counts)
"""
def weighted_std(values, weights, axis):
# Stolen from http://stackoverflow.com/questions/2413522
average = np.average(values, weights=weights, axis=axis)
average = average[self._simsalabim_slice(axis)]
variance = np.average((values-average)**2, weights=weights, axis=axis)
return np.sqrt(variance)
axis = self.get_axis_number(axis)
std_hist = weighted_std(self.all_axis_bin_centers(axis),
weights=self.histogram, axis=axis)
if self.dimensions == 2:
new_hist = Hist1d
else:
new_hist = Histdd
return new_hist.from_histogram(histogram=std_hist,
bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
axis_names=self.axis_names_without(axis))
##
# Other stuff
##
def bin_volumes(self):
return reduce(np.multiply, np.ix_(*[np.diff(bs) for bs in self.bin_edges]))
def rebin(self, *factors, **kwargs):
"""Return a new histogram that is 'rebinned' (zoomed) by factors (tuple of floats) along each dimensions
factors: tuple with zoom factors along each axis. e.g. 2 = double number of bins, 0.5 = halve them.
order: Order for spline interpolation in scipy.ndimage.zoom. Defaults to linear interpolation (order=1).
The only accepted keyword argument is 'order'!!! (python 2 is not nice)
The normalization is set to the normalization of the current histogram
The factors don't have to be integers or fractions: scipy.ndimage.zoom deals with the rebinning arcana.
"""
if not HAVE_SCIPY:
raise NotImplementedError("Rebinning requires scipy.ndimage")
if any([x != 'order' for x in kwargs.keys()]):
raise ValueError("Only 'order' keyword argument is accepted. Yeah, this is confusing.. blame python 2.")
order = kwargs.get('order', 1)
# Construct a new histogram
mh = self.similar_blank_histogram()
if not len(factors) == self.dimensions:
raise ValueError("You must pass %d rebin factors to rebin a %d-dimensional histogram" % (
self.dimensions, self.dimensions
))
# Zoom the bin edges.
# It's a bit tricky for non-uniform bins:
# we first construct a linear interpolator to take
# fraction along axis -> axis coordinate according to current binning.
# Then we feed it the new desired binning fractions.
for i, f in enumerate(factors):
x = self.bin_edges[i]
mh.bin_edges[i] = np.interp(
x=np.linspace(0, 1, (len(x) - 1) * f + 1),
xp=np.linspace(0, 1, len(x)),
fp=x)
# Rebin the histogram using ndimage.zoom, then renormalize
mh.histogram = zoom(self.histogram, factors, order=order)
mh.histogram *= self.histogram.sum() / mh.histogram.sum()
# mh.histogram /= np.product(factors)
return mh
def get_random(self, size=10):
"""Returns (size, n_dim) array of random variates from the histogram.
Inside the bins, a uniform distribution is assumed
Note this assumes the histogram is an 'events per bin', not a pdf.
TODO: test more.
"""
# Sample random bin centers
bin_centers_ravel = np.array(np.meshgrid(*self.bin_centers(),
indexing='ij')).reshape(self.dimensions, -1).T
hist_ravel = self.histogram.ravel()
hist_ravel = hist_ravel.astype(np.float) / np.nansum(hist_ravel)
result = bin_centers_ravel[np.random.choice(len(bin_centers_ravel),
p=hist_ravel,
size=size)]
# Randomize the position inside the bin
for dim_i in range(self.dimensions):
bin_edges = self.bin_edges[dim_i]
bin_widths = np.diff(bin_edges)
# Note the - 1: for the first bin's bin center, searchsorted gives 1, but we want 0 here:
index_of_bin = np.searchsorted(bin_edges, result[:, dim_i]) - 1
result[:, dim_i] += (np.random.rand(size) - 0.5) * bin_widths[index_of_bin]
return result
def lookup(self, *coordinate_arrays):
"""Lookup values at specific points.
coordinate_arrays: numpy arrays of coordinates, one for each dimension
e.g. lookup(np.array([0, 2]), np.array([1, 3])) looks up (x=0, y=1) and (x=2, y3).
Clips if out of range!! TODO: option to throw exception instead.
TODO: Needs tests!!
TODO: port to Hist1d... or finally join the classes
TODO: Support for scalar arguments
"""
assert len(coordinate_arrays) == self.dimensions
# Convert each coordinate array to an index array
index_arrays = [np.clip(np.searchsorted(self.bin_edges[i], coordinate_arrays[i]) - 1,
0,
len(self.bin_edges[i]) - 2)
for i in range(self.dimensions)]
# Use the index arrays to slice the histogram
return self.histogram[tuple(index_arrays)]
# Check against slow version:
# def hist_to_interpolator_slow(mh):
# bin_centers_ravel = np.array(np.meshgrid(*mh.bin_centers(), indexing='ij')).reshape(2, -1).T
# return NearestNDInterpolator(bin_centers_ravel, mh.histogram.ravel())
# x = np.random.uniform(0, 400, 100)
# y = np.random.uniform(0, 200, 100)
# hist_to_interpolator(mh)(x, y) - hist_to_interpolator_slow(mh)(x, y)
def lookup_hist(self, mh):
"""Return histogram within binning of Histdd mh, with values looked up in this histogram.
This is not rebinning: no interpolation /renormalization is performed.
It's just a lookup.
"""
result = mh.similar_blank_histogram()
points = np.stack([mh.all_axis_bin_centers(i)
for i in range(mh.dimensions)]).reshape(mh.dimensions, -1)
values = self.lookup(*points)
result.histogram = values.reshape(result.histogram.shape)
return result
def plot(self, log_scale=False, log_scale_vmin=1,
colorbar=True,
cblabel='Number of entries',
colorbar_kwargs=None,
plt=plt,
**kwargs):
if colorbar_kwargs is None:
colorbar_kwargs = dict()
colorbar_kwargs['label'] = cblabel
if not CAN_PLOT:
raise ValueError("matplotlib did not import, so can't plot your histogram...")
if self.dimensions == 1:
return Hist1d.from_histogram(self.histogram, self.bin_edges[0]).plot(**kwargs)
elif self.dimensions == 2:
if log_scale:
kwargs.setdefault('norm', matplotlib.colors.LogNorm(vmin=max(log_scale_vmin, self.histogram.min()),
vmax=self.histogram.max()))
mesh = plt.pcolormesh(self.bin_edges[0], self.bin_edges[1], self.histogram.T, **kwargs)
plt.xlim(np.min(self.bin_edges[0]), np.max(self.bin_edges[0]))
plt.ylim(np.min(self.bin_edges[1]), np.max(self.bin_edges[1]))
if self.axis_names:
plt.xlabel(self.axis_names[0])
plt.ylabel(self.axis_names[1])
if colorbar:
cb = plt.colorbar(**colorbar_kwargs)
cb.ax.minorticks_on()
return mesh, cb
return mesh
else:
raise ValueError("Can only plot 1- or 2-dimensional histograms!")
Histdd.project = Histdd.projection
if __name__ == '__main__':
# Create histograms just like from numpy...
m = Hist1d([0, 3, 1, 6, 2, 9], bins=3)
# ...or add data incrementally:
m = Hist1d(bins=100, range=(-3, 4))
m.add(np.random.normal(0, 0.5, 10**4))
m.add(np.random.normal(2, 0.2, 10**3))
# Get the data back out:
print(m.histogram, m.bin_edges)
if CAN_PLOT:
# Access derived quantities like bin_centers, normalized_histogram, density, cumulative_density, mean, std
plt.plot(m.bin_centers, m.normalized_histogram, label="Normalized histogram", linestyle='steps')
plt.plot(m.bin_centers, m.density, label="Empirical PDF", linestyle='steps')
plt.plot(m.bin_centers, m.cumulative_density, label="Empirical CDF", linestyle='steps')
plt.title("Estimated mean %0.2f, estimated std %0.2f" % (m.mean, m.std))
plt.legend(loc='best')
plt.show()
# Slicing and arithmetic behave just like ordinary ndarrays
print("The fourth bin has %d entries" % m[3])
m[1:4] += 4 + 2 * m[-27:-24]
print("Now it has %d entries" % m[3])
# Of course I couldn't resist adding a canned plotting function:
if CAN_PLOT:
m.plot()
plt.show()
# Create and show a 2d histogram. Axis names are optional.
m2 = Histdd(bins=100, range=[[-5, 3], [-3, 5]], axis_names=['x', 'y'])
m2.add(np.random.normal(1, 1, 10**6), np.random.normal(1, 1, 10**6))
m2.add(np.random.normal(-2, 1, 10**6), np.random.normal(2, 1, 10**6))
# x and y projections return Hist1d objects
m2.projection('x').plot(label='x projection')
m2.projection(1).plot(label='y projection')
plt.legend()
plt.show()