Skip to content

Commit 66d704b

Browse files
authored
Minor YOLO sample edits to speed up CPU post-processing (#200)
1 parent 57d4cec commit 66d704b

File tree

2 files changed

+70
-50
lines changed

2 files changed

+70
-50
lines changed

Samples/yolov4/yolov4.cpp

Lines changed: 45 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -339,62 +339,76 @@ void Sample::GetModelPredictions(
339339
// values total.
340340
assert(anchors.size() == 6);
341341

342-
std::vector<float> tensorData = CopyReadbackHeap<float>(modelOutput.readback.Get());
343-
TensorView<float> predTensor(tensorData, NchwExtents(modelOutput.desc.sizes));
342+
// DirectML writes the final output data in NHWC, where the C channel contains the bounding box & probabilities
343+
// for each prediction.
344+
const uint32_t predTensorN = modelOutput.desc.sizes[0];
345+
const uint32_t predTensorH = modelOutput.desc.sizes[1];
346+
const uint32_t predTensorW = modelOutput.desc.sizes[2];
347+
const uint32_t predTensorC = modelOutput.desc.sizes[3];
344348

345349
// YoloV4 predicts 3 boxes per scale, so we expect 3 separate predictions here
346-
assert(predTensor.Sizes().n == 3);
347-
348-
// Channel should contain the bounding box x/y/w/h, a confidence score, followed by probabilities for each class
349-
assert(predTensor.Sizes().c == 5 + YoloV4Constants::c_numClasses);
350+
assert(predTensorN == 3);
351+
352+
// Channel should contain the bounding box x/y/w/h, a confidence score, the probability for max class, and the class index
353+
assert(predTensorC == 7);
354+
355+
struct PotentialPrediction
356+
{
357+
float bx;
358+
float by;
359+
float bw;
360+
float bh;
361+
float confidence;
362+
float classMaxProbability;
363+
uint32_t classIndex;
364+
};
350365

351-
for (uint32_t n = 0; n < predTensor.Sizes().n; ++n)
366+
// The output tensor should be large enough to hold the expected number of predictions.
367+
assert(predTensorN * predTensorH * predTensorW * sizeof(PotentialPrediction) <= modelOutput.desc.totalTensorSizeInBytes);
368+
std::vector<PotentialPrediction> tensorData = CopyReadbackHeap<PotentialPrediction>(modelOutput.readback.Get());
369+
370+
// Scale the boxes to be relative to the original image size
371+
auto viewport = m_deviceResources->GetScreenViewport();
372+
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
373+
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
374+
375+
uint32_t currentPredIndex = 0;
376+
for (uint32_t n = 0; n < predTensorN; ++n)
352377
{
353-
for (uint32_t h = 0; h < predTensor.Sizes().h; ++h)
378+
for (uint32_t h = 0; h < predTensorH; ++h)
354379
{
355-
for (uint32_t w = 0; w < predTensor.Sizes().w; ++w)
380+
for (uint32_t w = 0; w < predTensorW; ++w)
356381
{
357-
float bx = predTensor(n, 0, h, w);
358-
float by = predTensor(n, 1, h, w);
359-
float bw = predTensor(n, 2, h, w);
360-
float bh = predTensor(n, 3, h, w);
361-
float confidence = predTensor(n, 4, h, w);
362-
363-
// Copy the probabilities for each class
364-
std::vector<float> probabilities;
365-
probabilities.reserve(YoloV4Constants::c_numClasses);
366-
for (uint32_t i = 5; i < predTensor.Sizes().c; ++i)
382+
const PotentialPrediction& currentPred = tensorData[currentPredIndex++];
383+
384+
// Discard boxes with low scores
385+
float score = currentPred.confidence * currentPred.classMaxProbability;
386+
if (score < YoloV4Constants::c_scoreThreshold)
367387
{
368-
probabilities.push_back(predTensor(n, i, h, w));
388+
continue;
369389
}
370390

371391
// We need to do some postprocessing on the raw values before we return them
372392

373393
// Apply xyScale. Need to apply offsets of half a grid cell here, to ensure the scaling is
374394
// centered around zero.
375-
bx = xyScale * (bx - 0.5f) + 0.5f;
376-
by = xyScale * (by - 0.5f) + 0.5f;
395+
float bx = xyScale * (currentPred.bx - 0.5f) + 0.5f;
396+
float by = xyScale * (currentPred.by - 0.5f) + 0.5f;
377397

378398
// Transform the x/y from being relative to the grid cell, to being relative to the whole image
379399
bx = (bx + (float)w) * stride;
380400
by = (by + (float)h) * stride;
381401

382402
// Scale the w/h by the supplied anchors
383-
bw *= anchors[n * 2];
384-
bh *= anchors[n * 2 + 1];
403+
float bw = currentPred.bw * anchors[n * 2];
404+
float bh = currentPred.bh * anchors[n * 2 + 1];
385405

386406
// Convert x,y,w,h to xmin,ymin,xmax,ymax
387407
float xmin = bx - bw / 2;
388408
float ymin = by - bh / 2;
389409
float xmax = bx + bw / 2;
390410
float ymax = by + bh / 2;
391411

392-
auto viewport = m_deviceResources->GetScreenViewport();
393-
394-
// Scale the boxes to be relative to the original image size
395-
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
396-
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
397-
398412
xmin *= xScale;
399413
ymin *= yScale;
400414
xmax *= xScale;
@@ -412,22 +426,13 @@ void Sample::GetModelPredictions(
412426
continue;
413427
}
414428

415-
// Discard boxes with low scores
416-
ptrdiff_t classIndex = std::max_element(probabilities.begin(), probabilities.end()) - probabilities.begin();
417-
float probability = probabilities[classIndex];
418-
float score = confidence * probability;
419-
if (score < YoloV4Constants::c_scoreThreshold)
420-
{
421-
continue;
422-
}
423-
424429
Prediction pred = {};
425430
pred.xmin = xmin;
426431
pred.ymin = ymin;
427432
pred.xmax = xmax;
428433
pred.ymax = ymax;
429434
pred.score = score;
430-
pred.predictedClass = static_cast<uint32_t>(classIndex);
435+
pred.predictedClass = currentPred.classIndex;
431436
out->push_back(pred);
432437
}
433438
}

Samples/yolov4/yolov4ResourceBuilder.cpp

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ class YoloV4
276276
}
277277
};
278278

279-
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, 5 + numClasses, H, W].
279+
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, H, W, 7].
280280
// Sigmoid activation is applied to all channels that represent probabilities (which are not all of them).
281281
dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
282282
{
@@ -294,23 +294,38 @@ dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
294294
// Since this doesn't transform the data any, this can be accomplished with a simple reinterpret.
295295
output = dml::Reinterpret(output, { 3, numClasses + 5, outputSizes[2], outputSizes[3] }, dml::NullOpt);
296296

297-
// Split the new channel (of size 5+numClasses) into 3 different tensors with channels of 2, 2, 1+numClasses.
298-
// These represent the box xy, box wh, confidence+probabilities for each class.
299-
std::vector<dml::Expression> split = dml::Split(output, 1, { 2, 2, 1 + numClasses });
300-
assert(split.size() == 3);
297+
// Split the new channel (of size 5+numClasses) into 4 different tensors with channels of 2, 2, 1, numClasses.
298+
// These represent the box xy, box wh, confidence, and probabilities for each class.
299+
const uint32_t channelDim = 1;
300+
std::vector<dml::Expression> split = dml::Split(output, channelDim, { 2, 2, 1, numClasses });
301+
assert(split.size() == 4);
301302

302303
// Convenience
303304
auto convXy = split[0];
304305
auto convWh = split[1];
305-
auto convConfProb = split[2];
306+
auto convConf = split[2];
307+
auto convProb = split[3];
306308

307309
// Apply final activations
308310
convXy = dml::ActivationSigmoid(convXy);
309311
convWh = dml::Exp(convWh);
310-
convConfProb = dml::ActivationSigmoid(convConfProb);
311-
312-
const uint32_t joinAxis = 1; // Join along channel
313-
return dml::Join({ convXy, convWh, convConfProb }, joinAxis);
312+
convConf = dml::ActivationSigmoid(convConf);
313+
convProb = dml::ActivationSigmoid(convProb);
314+
315+
// Compute the max and argmax of the probabilities. The argmax outputs UINT32 indices which
316+
// are reinterpreted as float so they can be joined into the same output tensor.
317+
auto convProbMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_MAX, { channelDim });
318+
auto convProbArgMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_ARGMAX, { channelDim });
319+
convProbArgMax = dml::Reinterpret(convProbArgMax, DML_TENSOR_DATA_TYPE_FLOAT32);
320+
321+
// Join the tensors along channel dimension.
322+
auto joined = dml::Join({ convXy, convWh, convConf, convProbMax, convProbArgMax }, channelDim);
323+
324+
// Transpose from NCHW to NHWC for faster reading on the CPU (converts output from SoA to AoS).
325+
dml::TensorDimensions sizesNchw = joined.GetOutputDesc().sizes;
326+
dml::TensorDimensions sizesNhwc = { sizesNchw[0], sizesNchw[3], sizesNchw[2], sizesNchw[1] };
327+
dml::TensorStrides stridesNhwc = { sizesNchw[1] * sizesNchw[2] * sizesNchw[3], sizesNchw[3], 1, sizesNchw[2] * sizesNchw[3] };
328+
return dml::Identity(dml::Reinterpret(joined, sizesNhwc, stridesNhwc));
314329
}
315330

316331
void Sample::CreateDirectMLResources()

0 commit comments

Comments
 (0)