Commit 1c6d548

Proper Numpy solution for Categorize (bug-fix and also 10X speedup).
1 parent d14a4fa

histogrammar/primitives/categorize.py

Lines changed: 18 additions & 7 deletions
@@ -237,16 +237,27 @@ def _numpy(self, data, weights, shape):
         self._checkNPQuantity(q, shape)
         self._checkNPWeights(weights, shape)
         weights = self._makeNPWeights(weights, shape)
+        newentries = weights.sum()
+
+        subweights = weights.copy()
+        subweights[weights < 0.0] = 0.0
 
-        # no possibility of exception from here on out (for rollback)
-        for x, w in zip(q, weights):
-            if w > 0.0:
-                if x not in self.bins:
-                    self.bins[x] = self.value.zero()
-                self.bins[x].fill(x, w)
+        import numpy
+        selection = numpy.empty(q.shape, dtype=numpy.bool)
+
+        uniques, inverse = numpy.unique(q, return_inverse=True)
 
         # no possibility of exception from here on out (for rollback)
-        self.entries += float(weights.sum())
+        for i, x in enumerate(uniques):
+            if x not in self.bins:
+                self.bins[x] = self.value.zero()
+
+            numpy.not_equal(inverse, i, selection)
+            subweights[:] = weights
+            subweights[selection] = 0.0
+            self.bins[x]._numpy(data, subweights, shape)
+
+        self.entries += float(newentries)
 
     def _sparksql(self, jvm, converter):
         return converter.Categorize(self.quantity.asSparkSQL(), self.value._sparksql(jvm, converter))
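The change replaces the old row-by-row Python loop with one vectorized pass per unique category: numpy.unique groups the values, and for each category the weights of all other rows are masked to zero before the sub-weights are handed to that bin's aggregator. Below is a rough standalone sketch of that grouping trick only; the fill_categorized function and the plain dict of summed weights are illustrative stand-ins, not histogrammar's actual Categorize API.

import numpy as np

def fill_categorized(bins, values, weights):
    # Illustrative only: `bins` is a plain dict of category -> summed weight,
    # standing in for the per-category sub-aggregators in the real primitive.
    # Negative weights are clamped to zero, mirroring subweights[weights < 0.0] = 0.0.
    weights = np.where(weights < 0.0, 0.0, weights)

    # Group identical category values; inverse[j] says which unique value row j has.
    uniques, inverse = np.unique(values, return_inverse=True)

    subweights = np.empty_like(weights)
    for i, x in enumerate(uniques):
        # Keep the weights of rows belonging to category x, zero out everything else,
        # then hand the masked weights to that category's accumulator.
        subweights[:] = weights
        subweights[inverse != i] = 0.0
        bins[str(x)] = bins.get(str(x), 0.0) + float(subweights.sum())
    return bins

print(fill_categorized({}, np.array(["a", "b", "a", "a"]),
                       np.array([1.0, 2.0, 0.5, -1.0])))
# {'a': 1.5, 'b': 2.0} -- the negative weight contributes nothing

The inner loop now runs once per distinct category rather than once per data row, which is presumably where the quoted 10X speedup comes from when there are few categories relative to the number of rows. The visible behavioural difference is that each sub-aggregator now receives the full data with masked weights via _numpy(data, subweights, shape) instead of being filled with the category label itself, which appears to be the bug the commit message refers to.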
