Merge pull request #16 from histogrammar/1.0.x

jpivarski · web-flow · commit af449515d851 · 2017-03-21T09:48:34.000-05:00
Proper NumPy solution for Categorize (bug fix and also a 10x speedup).
diff --git a/docs/conf.py b/docs/conf.py
@@ -47,9 +47,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = "1.0.5"
+version = "1.0.6"
 # The full version, including alpha/beta/rc tags.
-release = "1.0.5"
+release = "1.0.6"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/histogrammar/defs.py b/histogrammar/defs.py
@@ -107,7 +107,7 @@ def fromJsonFragment(json, nameFromParent):
 
     @staticmethod
     def fromJsonFile(fileName):
-        return Factory.fromJson(jsonlib.load(open(fileName), json))
+        return Factory.fromJson(jsonlib.load(open(fileName)))
 
     @staticmethod
     def fromJsonString(json):
diff --git a/histogrammar/primitives/categorize.py b/histogrammar/primitives/categorize.py
@@ -237,16 +237,27 @@ def _numpy(self, data, weights, shape):
         self._checkNPQuantity(q, shape)
         self._checkNPWeights(weights, shape)
         weights = self._makeNPWeights(weights, shape)
+        newentries = weights.sum()
+        
+        subweights = weights.copy()
+        subweights[weights < 0.0] = 0.0
 
-        # no possibility of exception from here on out (for rollback)
-        for x, w in zip(q, weights):
-            if w > 0.0:
-                if x not in self.bins:
-                    self.bins[x] = self.value.zero()
-                self.bins[x].fill(x, w)
+        import numpy
+        selection = numpy.empty(q.shape, dtype=numpy.bool)
+
+        uniques, inverse = numpy.unique(q, return_inverse=True)
 
         # no possibility of exception from here on out (for rollback)
-        self.entries += float(weights.sum())
+        for i, x in enumerate(uniques):
+            if x not in self.bins:
+                self.bins[x] = self.value.zero()
+            
+            numpy.not_equal(inverse, i, selection)
+            subweights[:] = weights
+            subweights[selection] = 0.0
+            self.bins[x]._numpy(data, subweights, shape)
+
+        self.entries += float(newentries)
 
     def _sparksql(self, jvm, converter):
         return converter.Categorize(self.quantity.asSparkSQL(), self.value._sparksql(jvm, converter))
diff --git a/histogrammar/version.py b/histogrammar/version.py
@@ -16,7 +16,7 @@
 
 import re
 
-__version__ = "1.0.5"
+__version__ = "1.0.6"
 
 version = __version__