Skip to content

Commit af44951

Browse files
authored
Merge pull request #16 from histogrammar/1.0.x
Proper NumPy solution for Categorize (a bug fix and also a 10x speedup).
2 parents 3c2bc96 + 536d810 commit af44951

File tree

4 files changed

+22
-11
lines changed

4 files changed

+22
-11
lines changed

docs/conf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,9 @@
4747
# built documents.
4848
#
4949
# The short X.Y version.
50-
version = "1.0.5"
50+
version = "1.0.6"
5151
# The full version, including alpha/beta/rc tags.
52-
release = "1.0.5"
52+
release = "1.0.6"
5353

5454
# The language for content autogenerated by Sphinx. Refer to documentation
5555
# for a list of supported languages.

histogrammar/defs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def fromJsonFragment(json, nameFromParent):
107107

108108
@staticmethod
109109
def fromJsonFile(fileName):
110-
return Factory.fromJson(jsonlib.load(open(fileName), json))
110+
return Factory.fromJson(jsonlib.load(open(fileName)))
111111

112112
@staticmethod
113113
def fromJsonString(json):

histogrammar/primitives/categorize.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -237,16 +237,27 @@ def _numpy(self, data, weights, shape):
237237
self._checkNPQuantity(q, shape)
238238
self._checkNPWeights(weights, shape)
239239
weights = self._makeNPWeights(weights, shape)
240+
newentries = weights.sum()
241+
242+
subweights = weights.copy()
243+
subweights[weights < 0.0] = 0.0
240244

241-
# no possibility of exception from here on out (for rollback)
242-
for x, w in zip(q, weights):
243-
if w > 0.0:
244-
if x not in self.bins:
245-
self.bins[x] = self.value.zero()
246-
self.bins[x].fill(x, w)
245+
import numpy
246+
selection = numpy.empty(q.shape, dtype=numpy.bool)
247+
248+
uniques, inverse = numpy.unique(q, return_inverse=True)
247249

248250
# no possibility of exception from here on out (for rollback)
249-
self.entries += float(weights.sum())
251+
for i, x in enumerate(uniques):
252+
if x not in self.bins:
253+
self.bins[x] = self.value.zero()
254+
255+
numpy.not_equal(inverse, i, selection)
256+
subweights[:] = weights
257+
subweights[selection] = 0.0
258+
self.bins[x]._numpy(data, subweights, shape)
259+
260+
self.entries += float(newentries)
250261

251262
def _sparksql(self, jvm, converter):
252263
return converter.Categorize(self.quantity.asSparkSQL(), self.value._sparksql(jvm, converter))

histogrammar/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import re
1818

19-
__version__ = "1.0.5"
19+
__version__ = "1.0.6"
2020

2121
version = __version__
2222

0 commit comments

Comments
 (0)