@@ -339,62 +339,76 @@ void Sample::GetModelPredictions(
339
339
// values total.
340
340
assert (anchors.size () == 6 );
341
341
342
- std::vector<float > tensorData = CopyReadbackHeap<float >(modelOutput.readback .Get ());
343
- TensorView<float > predTensor (tensorData, NchwExtents (modelOutput.desc .sizes ));
342
+ // DirectML writes the final output data in NHWC, where the C channel contains the bounding box & probabilities
343
+ // for each prediction.
344
+ const uint32_t predTensorN = modelOutput.desc .sizes [0 ];
345
+ const uint32_t predTensorH = modelOutput.desc .sizes [1 ];
346
+ const uint32_t predTensorW = modelOutput.desc .sizes [2 ];
347
+ const uint32_t predTensorC = modelOutput.desc .sizes [3 ];
344
348
345
349
// YoloV4 predicts 3 boxes per scale, so we expect 3 separate predictions here
346
- assert (predTensor.Sizes ().n == 3 );
347
-
348
- // Channel should contain the bounding box x/y/w/h, a confidence score, followed by probabilities for each class
349
- assert (predTensor.Sizes ().c == 5 + YoloV4Constants::c_numClasses);
350
+ assert (predTensorN == 3 );
351
+
352
+ // Channel should contain the bounding box x/y/w/h, a confidence score, the probability for max class, and the class index
353
+ assert (predTensorC == 7 );
354
+
355
+ struct PotentialPrediction
356
+ {
357
+ float bx;
358
+ float by;
359
+ float bw;
360
+ float bh;
361
+ float confidence;
362
+ float classMaxProbability;
363
+ uint32_t classIndex;
364
+ };
350
365
351
- for (uint32_t n = 0 ; n < predTensor.Sizes ().n ; ++n)
366
+ // The output tensor should be large enough to hold the expected number of predictions.
367
+ assert (predTensorN * predTensorH * predTensorW * sizeof (PotentialPrediction) <= modelOutput.desc .totalTensorSizeInBytes );
368
+ std::vector<PotentialPrediction> tensorData = CopyReadbackHeap<PotentialPrediction>(modelOutput.readback .Get ());
369
+
370
+ // Scale the boxes to be relative to the original image size
371
+ auto viewport = m_deviceResources->GetScreenViewport ();
372
+ float xScale = (float )viewport.Width / YoloV4Constants::c_inputWidth;
373
+ float yScale = (float )viewport.Height / YoloV4Constants::c_inputHeight;
374
+
375
+ uint32_t currentPredIndex = 0 ;
376
+ for (uint32_t n = 0 ; n < predTensorN; ++n)
352
377
{
353
- for (uint32_t h = 0 ; h < predTensor. Sizes (). h ; ++h)
378
+ for (uint32_t h = 0 ; h < predTensorH ; ++h)
354
379
{
355
- for (uint32_t w = 0 ; w < predTensor. Sizes (). w ; ++w)
380
+ for (uint32_t w = 0 ; w < predTensorW ; ++w)
356
381
{
357
- float bx = predTensor (n, 0 , h, w);
358
- float by = predTensor (n, 1 , h, w);
359
- float bw = predTensor (n, 2 , h, w);
360
- float bh = predTensor (n, 3 , h, w);
361
- float confidence = predTensor (n, 4 , h, w);
362
-
363
- // Copy the probabilities for each class
364
- std::vector<float > probabilities;
365
- probabilities.reserve (YoloV4Constants::c_numClasses);
366
- for (uint32_t i = 5 ; i < predTensor.Sizes ().c ; ++i)
382
+ const PotentialPrediction& currentPred = tensorData[currentPredIndex++];
383
+
384
+ // Discard boxes with low scores
385
+ float score = currentPred.confidence * currentPred.classMaxProbability ;
386
+ if (score < YoloV4Constants::c_scoreThreshold)
367
387
{
368
- probabilities. push_back ( predTensor (n, i, h, w)) ;
388
+ continue ;
369
389
}
370
390
371
391
// We need to do some postprocessing on the raw values before we return them
372
392
373
393
// Apply xyScale. Need to apply offsets of half a grid cell here, to ensure the scaling is
374
394
// centered around zero.
375
- bx = xyScale * (bx - 0 .5f ) + 0 .5f ;
376
- by = xyScale * (by - 0 .5f ) + 0 .5f ;
395
+ float bx = xyScale * (currentPred. bx - 0 .5f ) + 0 .5f ;
396
+ float by = xyScale * (currentPred. by - 0 .5f ) + 0 .5f ;
377
397
378
398
// Transform the x/y from being relative to the grid cell, to being relative to the whole image
379
399
bx = (bx + (float )w) * stride;
380
400
by = (by + (float )h) * stride;
381
401
382
402
// Scale the w/h by the supplied anchors
383
- bw *= anchors[n * 2 ];
384
- bh *= anchors[n * 2 + 1 ];
403
+ float bw = currentPred. bw * anchors[n * 2 ];
404
+ float bh = currentPred. bh * anchors[n * 2 + 1 ];
385
405
386
406
// Convert x,y,w,h to xmin,ymin,xmax,ymax
387
407
float xmin = bx - bw / 2 ;
388
408
float ymin = by - bh / 2 ;
389
409
float xmax = bx + bw / 2 ;
390
410
float ymax = by + bh / 2 ;
391
411
392
- auto viewport = m_deviceResources->GetScreenViewport ();
393
-
394
- // Scale the boxes to be relative to the original image size
395
- float xScale = (float )viewport.Width / YoloV4Constants::c_inputWidth;
396
- float yScale = (float )viewport.Height / YoloV4Constants::c_inputHeight;
397
-
398
412
xmin *= xScale;
399
413
ymin *= yScale;
400
414
xmax *= xScale;
@@ -412,22 +426,13 @@ void Sample::GetModelPredictions(
412
426
continue ;
413
427
}
414
428
415
- // Discard boxes with low scores
416
- ptrdiff_t classIndex = std::max_element (probabilities.begin (), probabilities.end ()) - probabilities.begin ();
417
- float probability = probabilities[classIndex];
418
- float score = confidence * probability;
419
- if (score < YoloV4Constants::c_scoreThreshold)
420
- {
421
- continue ;
422
- }
423
-
424
429
Prediction pred = {};
425
430
pred.xmin = xmin;
426
431
pred.ymin = ymin;
427
432
pred.xmax = xmax;
428
433
pred.ymax = ymax;
429
434
pred.score = score;
430
- pred.predictedClass = static_cast < uint32_t >( classIndex) ;
435
+ pred.predictedClass = currentPred. classIndex ;
431
436
out->push_back (pred);
432
437
}
433
438
}
0 commit comments