Skip to content

Commit 66d704b

Browse files
authored
Minor YOLO sample edits to speed up CPU post-processing (#200)
1 parent 57d4cec commit 66d704b

File tree

2 files changed

+70
-50
lines changed

2 files changed

+70
-50
lines changed

Samples/yolov4/yolov4.cpp

Lines changed: 45 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -339,62 +339,76 @@ void Sample::GetModelPredictions(
339339
// values total.
340340
assert(anchors.size() == 6);
341341

342-
std::vector<float> tensorData = CopyReadbackHeap<float>(modelOutput.readback.Get());
343-
TensorView<float> predTensor(tensorData, NchwExtents(modelOutput.desc.sizes));
342+
// DirectML writes the final output data in NHWC, where the C channel contains the bounding box & probabilities
343+
// for each prediction.
344+
const uint32_t predTensorN = modelOutput.desc.sizes[0];
345+
const uint32_t predTensorH = modelOutput.desc.sizes[1];
346+
const uint32_t predTensorW = modelOutput.desc.sizes[2];
347+
const uint32_t predTensorC = modelOutput.desc.sizes[3];
344348

345349
// YoloV4 predicts 3 boxes per scale, so we expect 3 separate predictions here
346-
assert(predTensor.Sizes().n == 3);
347-
348-
// Channel should contain the bounding box x/y/w/h, a confidence score, followed by probabilities for each class
349-
assert(predTensor.Sizes().c == 5 + YoloV4Constants::c_numClasses);
350+
assert(predTensorN == 3);
351+
352+
// Channel should contain the bounding box x/y/w/h, a confidence score, the probability for max class, and the class index
353+
assert(predTensorC == 7);
354+
355+
struct PotentialPrediction
356+
{
357+
float bx;
358+
float by;
359+
float bw;
360+
float bh;
361+
float confidence;
362+
float classMaxProbability;
363+
uint32_t classIndex;
364+
};
350365

351-
for (uint32_t n = 0; n < predTensor.Sizes().n; ++n)
366+
// The output tensor should be large enough to hold the expected number of predictions.
367+
assert(predTensorN * predTensorH * predTensorW * sizeof(PotentialPrediction) <= modelOutput.desc.totalTensorSizeInBytes);
368+
std::vector<PotentialPrediction> tensorData = CopyReadbackHeap<PotentialPrediction>(modelOutput.readback.Get());
369+
370+
// Scale the boxes to be relative to the original image size
371+
auto viewport = m_deviceResources->GetScreenViewport();
372+
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
373+
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
374+
375+
uint32_t currentPredIndex = 0;
376+
for (uint32_t n = 0; n < predTensorN; ++n)
352377
{
353-
for (uint32_t h = 0; h < predTensor.Sizes().h; ++h)
378+
for (uint32_t h = 0; h < predTensorH; ++h)
354379
{
355-
for (uint32_t w = 0; w < predTensor.Sizes().w; ++w)
380+
for (uint32_t w = 0; w < predTensorW; ++w)
356381
{
357-
float bx = predTensor(n, 0, h, w);
358-
float by = predTensor(n, 1, h, w);
359-
float bw = predTensor(n, 2, h, w);
360-
float bh = predTensor(n, 3, h, w);
361-
float confidence = predTensor(n, 4, h, w);
362-
363-
// Copy the probabilities for each class
364-
std::vector<float> probabilities;
365-
probabilities.reserve(YoloV4Constants::c_numClasses);
366-
for (uint32_t i = 5; i < predTensor.Sizes().c; ++i)
382+
const PotentialPrediction& currentPred = tensorData[currentPredIndex++];
383+
384+
// Discard boxes with low scores
385+
float score = currentPred.confidence * currentPred.classMaxProbability;
386+
if (score < YoloV4Constants::c_scoreThreshold)
367387
{
368-
probabilities.push_back(predTensor(n, i, h, w));
388+
continue;
369389
}
370390

371391
// We need to do some postprocessing on the raw values before we return them
372392

373393
// Apply xyScale. Need to apply offsets of half a grid cell here, to ensure the scaling is
374394
// centered around zero.
375-
bx = xyScale * (bx - 0.5f) + 0.5f;
376-
by = xyScale * (by - 0.5f) + 0.5f;
395+
float bx = xyScale * (currentPred.bx - 0.5f) + 0.5f;
396+
float by = xyScale * (currentPred.by - 0.5f) + 0.5f;
377397

378398
// Transform the x/y from being relative to the grid cell, to being relative to the whole image
379399
bx = (bx + (float)w) * stride;
380400
by = (by + (float)h) * stride;
381401

382402
// Scale the w/h by the supplied anchors
383-
bw *= anchors[n * 2];
384-
bh *= anchors[n * 2 + 1];
403+
float bw = currentPred.bw * anchors[n * 2];
404+
float bh = currentPred.bh * anchors[n * 2 + 1];
385405

386406
// Convert x,y,w,h to xmin,ymin,xmax,ymax
387407
float xmin = bx - bw / 2;
388408
float ymin = by - bh / 2;
389409
float xmax = bx + bw / 2;
390410
float ymax = by + bh / 2;
391411

392-
auto viewport = m_deviceResources->GetScreenViewport();
393-
394-
// Scale the boxes to be relative to the original image size
395-
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
396-
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
397-
398412
xmin *= xScale;
399413
ymin *= yScale;
400414
xmax *= xScale;
@@ -412,22 +426,13 @@ void Sample::GetModelPredictions(
412426
continue;
413427
}
414428

415-
// Discard boxes with low scores
416-
ptrdiff_t classIndex = std::max_element(probabilities.begin(), probabilities.end()) - probabilities.begin();
417-
float probability = probabilities[classIndex];
418-
float score = confidence * probability;
419-
if (score < YoloV4Constants::c_scoreThreshold)
420-
{
421-
continue;
422-
}
423-
424429
Prediction pred = {};
425430
pred.xmin = xmin;
426431
pred.ymin = ymin;
427432
pred.xmax = xmax;
428433
pred.ymax = ymax;
429434
pred.score = score;
430-
pred.predictedClass = static_cast<uint32_t>(classIndex);
435+
pred.predictedClass = currentPred.classIndex;
431436
out->push_back(pred);
432437
}
433438
}

Samples/yolov4/yolov4ResourceBuilder.cpp

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ class YoloV4
276276
}
277277
};
278278

279-
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, 5 + numClasses, H, W].
279+
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, H, W, 7].
280280
// Sigmoid activation is applied to all channels that represent probabilities (which are not all of them).
281281
dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
282282
{
@@ -294,23 +294,38 @@ dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
294294
// Since this doesn't transform the data any, this can be accomplished with a simple reinterpret.
295295
output = dml::Reinterpret(output, { 3, numClasses + 5, outputSizes[2], outputSizes[3] }, dml::NullOpt);
296296

297-
// Split the new channel (of size 5+numClasses) into 3 different tensors with channels of 2, 2, 1+numClasses.
298-
// These represent the box xy, box wh, confidence+probabilities for each class.
299-
std::vector<dml::Expression> split = dml::Split(output, 1, { 2, 2, 1 + numClasses });
300-
assert(split.size() == 3);
297+
// Split the new channel (of size 5+numClasses) into 4 different tensors with channels of 2, 2, 1, numClasses.
298+
// These represent the box xy, box wh, confidence, and probabilities for each class.
299+
const uint32_t channelDim = 1;
300+
std::vector<dml::Expression> split = dml::Split(output, channelDim, { 2, 2, 1, numClasses });
301+
assert(split.size() == 4);
301302

302303
// Convenience
303304
auto convXy = split[0];
304305
auto convWh = split[1];
305-
auto convConfProb = split[2];
306+
auto convConf = split[2];
307+
auto convProb = split[3];
306308

307309
// Apply final activations
308310
convXy = dml::ActivationSigmoid(convXy);
309311
convWh = dml::Exp(convWh);
310-
convConfProb = dml::ActivationSigmoid(convConfProb);
311-
312-
const uint32_t joinAxis = 1; // Join along channel
313-
return dml::Join({ convXy, convWh, convConfProb }, joinAxis);
312+
convConf = dml::ActivationSigmoid(convConf);
313+
convProb = dml::ActivationSigmoid(convProb);
314+
315+
// Compute the max and argmax of the probabilities. The argmax outputs UINT32 indices which
316+
// are reinterpreted as float so they can be joined into the same output tensor.
317+
auto convProbMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_MAX, { channelDim });
318+
auto convProbArgMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_ARGMAX, { channelDim });
319+
convProbArgMax = dml::Reinterpret(convProbArgMax, DML_TENSOR_DATA_TYPE_FLOAT32);
320+
321+
// Join the tensors along channel dimension.
322+
auto joined = dml::Join({ convXy, convWh, convConf, convProbMax, convProbArgMax }, channelDim);
323+
324+
// Transpose from NCHW to NHWC for faster reading on the CPU (converts output from SoA to AoS).
325+
dml::TensorDimensions sizesNchw = joined.GetOutputDesc().sizes;
326+
dml::TensorDimensions sizesNhwc = { sizesNchw[0], sizesNchw[3], sizesNchw[2], sizesNchw[1] };
327+
dml::TensorStrides stridesNhwc = { sizesNchw[1] * sizesNchw[2] * sizesNchw[3], sizesNchw[3], 1, sizesNchw[2] * sizesNchw[3] };
328+
return dml::Identity(dml::Reinterpret(joined, sizesNhwc, stridesNhwc));
314329
}
315330

316331
void Sample::CreateDirectMLResources()

0 commit comments

Comments
 (0)