|
1 | | -# Benchmarks |
2 | | - |
3 | | -The benchmarks below compare the native cuda and rocm backends against |
4 | | -`pytorch_ocl` on two GPUs: rx6600xt and gtx960 |
5 | | - |
6 | | -Depending on the network, training performance is around 60 to 90 percent; |
7 | | -inference performance is somewhat better. |
8 | | - |
9 | | -Notes: time in ms per batch - smaller is better, input is standard imagenet |
10 | | -input Batchx3x224x224 |
11 | | - |
12 | | - |
13 | | -## Training |
14 | | - |
15 | | - |
16 | | - rx6600xt/8gb batch size rocm/hip opencl Ratio % |
17 | | - alexnet 64 57.848 82.381 70.2 |
18 | | - resnet18 64 146.917 238.889 61.5 |
19 | | - resnet50 32 266.441 357.985 74.4 |
20 | | - convnext_small 16 337.252 583.794 57.8 |
21 | | - vgg16 16 206.312 348.692 59.2 |
22 | | - densenet161 16 296.807 485.035 61.2 |
23 | | - mobilenet_v2 32 157.476 197.886 79.6 |
24 | | - mobilenet_v3_small 64 92.506 120.406 76.8 |
25 | | - mobilenet_v3_large 64 286.795 319.938 89.6 |
26 | | - resnext50_32x4d 32 336.464 491.112 68.5 |
27 | | - wide_resnet50_2 32 466.841 642.973 72.6 |
28 | | - mnasnet1_0 32 159.97 167.306 95.6 |
29 | | - efficientnet_b0 32 205.69 305.157 67.4 |
30 | | - regnet_y_400mf 64 171.691 244.587 70.2 |
31 | | - |
32 | | - Average 71.8 |
33 | | - |
34 | | - gtx960/4gb batch size cuda opencl Ratio % |
35 | | - alexnet 64 128.142 270.006 47.5 |
36 | | - resnet18 64 415.589 746.578 55.7 |
37 | | - resnet50 16 373.932 599.182 62.4 |
38 | | - convnext_small 8 1128.995 1175.585 96.0 |
39 | | - vgg16 8 364.176 561.695 64.8 |
40 | | - densenet161 8 463.427 728.693 63.6 |
41 | | - mobilenet_v2 16 173.13 352.728 49.1 |
42 | | - mobilenet_v3_small 32 101.621 206.353 49.2 |
43 | | - mobilenet_v3_large 32 263.055 523.575 50.2 |
44 | | - resnext50_32x4d 16 539.007 846.71 63.7 |
45 | | - wide_resnet50_2 16 677.57 1040.154 65.1 |
46 | | - mnasnet1_0 16 167.542 322.004 52.0 |
47 | | - efficientnet_b0 16 241.023 540.09 44.6 |
48 | | - regnet_y_400mf 32 353.889 391.025 90.5 |
49 | | - |
50 | | - Average 61.0 |
51 | | - |
52 | | -## Inference |
53 | | - |
54 | | -Note: since my AMD and Nvidia GPUs have different memory sizes, different |
55 | | -batch sizes were used |
56 | | - |
57 | | - |
58 | | - rx6600xt/8gb rocm/hip opencl Ratio % Batch=64 |
59 | | - convnext_small 476.549 600.921 79.3 |
60 | | - alexnet 24.587 26.311 93.4 |
61 | | - resnet18 41.375 59.375 69.7 |
62 | | - resnet50 165.261 194.512 85.0 |
63 | | - vgg16 205.124 309.937 66.2 |
64 | | - densenet161 409.38 414.496 98.8 |
65 | | - inception_v3 90.635 131.685 68.8 |
66 | | - mobilenet_v2 77.691 93.701 82.9 |
67 | | - mobilenet_v3_small 22.203 26.151 84.9 |
68 | | - mobilenet_v3_large 63.229 70.458 89.7 |
69 | | - resnext50_32x4d 244.676 274.791 89.0 |
70 | | - wide_resnet50_2 320.313 402.687 79.5 |
71 | | - mnasnet1_0 74.141 75.162 98.6 |
72 | | - efficientnet_b0 104.396 114.898 90.9 |
73 | | - efficientnet_b4 303.468 276.226 109.9 |
74 | | - regnet_y_400mf 43.298 57.491 75.3 |
75 | | - |
76 | | - Average 85.1 |
77 | | - |
78 | | - gtx960/4gb cuda opencl Ratio % Batch=32 |
79 | | - convnext_small 751.713 1206.871 62.3 |
80 | | - alexnet 29.446 44.27 66.5 |
81 | | - resnet18 66.053 93.352 70.8 |
82 | | - resnet50 214.787 316.754 67.8 |
83 | | - vgg16 350.278 486.743 72.0 |
84 | | - densenet161 511.183 587.856 87.0 |
85 | | - inception_v3 167.233 217.664 76.8 |
86 | | - mobilenet_v2 86.572 161.797 53.5 |
87 | | - mobilenet_v3_small 27.748 49.359 56.2 |
88 | | - mobilenet_v3_large 68.79 121.644 56.6 |
89 | | - resnext50_32x4d 284.697 440.466 64.6 |
90 | | - wide_resnet50_2 376.114 587.801 64.0 |
91 | | - mnasnet1_0 82.576 132.463 62.3 |
92 | | - efficientnet_b0 111.154 202.593 54.9 |
93 | | - efficientnet_b4 299.779 499.841 60.0 |
94 | | - regnet_y_400mf 99.336 95.446 104.1 |
95 | | - |
96 | | - Average 67.5 |
97 | | - |
98 | | - |
99 | | - |
100 | | - |
101 | | - |
102 | | - |
103 | | - |
104 | | - |
105 | | - |
106 | | - |
107 | | - |
108 | | - |
109 | | - |
110 | | - |
111 | | - |
112 | | - |
113 | | - |
114 | | - |
115 | | - |
116 | | - |
117 | | - |
118 | | - |
119 | | - |
120 | | - |
121 | | - |
122 | | - |
123 | | - |
124 | | - |
125 | | - |
126 | | - |
127 | | - |
128 | | - |
129 | | - |
130 | | - |
131 | | - |
132 | | - |
133 | | - |
134 | | - |
135 | | - |
136 | | - |
137 | | - |
138 | | - |
139 | | - |
140 | | - |
141 | | - |
142 | | - |
143 | | - |
| 1 | +# Setup |
| 2 | + |
| 3 | +Tested 3 setups, pytorch 2.4 |
| 4 | + |
| 5 | +1. AMD rx6600XT, OpenCL drivers vs official ROCM pytorch (6.1) |
| 6 | +2. NVidia gtx960, OpenCL drivers vs official CUDA 12.2 |
| 7 | +3. Intel Arc A380, OpenCL NEO driver vs XPU - intel extension for pytorch (2.1, since that is what was released) |
| 8 | + |
| 9 | +Input is standard ImageNet, batch x 3 x 224 x 224; time is in milliseconds, lower is better. |
| 10 | + |
| 11 | +# Training |
| 12 | + |
| 13 | + |
| 14 | + |
| 15 | +|AMD||||||Nvidia||||||Intel||||| |
| 16 | +|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| |
| 17 | +|rx6600xt|batch|OpenCL|ROCM|% Perf||gtx960|batch|OpenCL|CUDA|% Perf||A380|batch|OpenCL|XPU|% Perf| |
| 18 | +|alexnet|64|75.239|57.957|77||alexnet|64|257.09|130.561|51||alexnet|64|482.139|133.512|28| |
| 19 | +|resnet18|64|238.927|147.099|62||resnet18|64|695.096|419.69|60||resnet18|64|1044.985|397.738|38| |
| 20 | +|resnet50|32|358.872|266.155|74||resnet50|16|591.143|375.644|64||resnet50|16|640.916|329.849|51| |
| 21 | +|convnext_small|16|608.297|337.736|56||convnext_small|8|1001.294|1120.676|112||convnext_small|8|841.302|259.292|31| |
| 22 | +|vgg16|16|343.962|206.243|60||vgg16|8|520.75|363.288|70||vgg16|8|780.692|479.314|61| |
| 23 | +|densenet161|16|494.175|297.001|60||densenet161|8|698.842|464.051|66||densenet161|8|834.207|423.883|51| |
| 24 | +|mobilenet_v2|32|206.255|157.743|76||mobilenet_v2|16|335.279|173.748|52||mobilenet_v2|16|405.541|153.694|38| |
| 25 | +|mobilenet_v3_small|64|130.571|92.83|71||mobilenet_v3_small|32|196.173|102.561|52||mobilenet_v3_small|32|275.302|92.086|33| |
| 26 | +|mobilenet_v3_large|64|330.269|287.3|87||mobilenet_v3_large|32|497.168|264.072|53||mobilenet_v3_large|32|642.568|226.292|35| |
| 27 | +|resnext50_32x4d|32|490.971|336.183|68||resnext50_32x4d|16|807.178|539.026|67||resnext50_32x4d|16|1068.918|396.39|37| |
| 28 | +|wide_resnet50_2|32|643.083|468.04|73||wide_resnet50_2|16|1023.105|677.723|66||wide_resnet50_2|16|1373.346|634.213|46| |
| 29 | +|mnasnet1_0|32|167.934|160.254|95||mnasnet1_0|16|302.854|167.911|55||mnasnet1_0|16|383.069|126.56|33| |
| 30 | +|efficientnet_b0|32|313.972|205.674|66||efficientnet_b0|16|515.058|241.311|47||efficientnet_b0|16|531.724|203.157|38| |
| 31 | +|regnet_y_400mf|64|246.069|171.841|70||regnet_y_400mf|32|361.507|353.584|98||regnet_y_400mf|32|635.279|224.228|35| |
| 32 | +|Average||||71||Average||||65||Average||||40| |
| 33 | + |
| 34 | +# Inference |
| 35 | + |
| 36 | + |
| 37 | +|AMD||||||Nvidia||||||Intel||||| |
| 38 | +|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| |
| 39 | +|rx6600xt|batch|OpenCL|ROCM|% Perf||gtx960|batch|OpenCL|CUDA|% Perf||A380|batch|OpenCL|XPU|% Perf| |
| 40 | +|alexnet|64|24.543|24.642|100||alexnet|32|45.007|30.271|67||alexnet|32|55.5|25.835|47| |
| 41 | +|resnet18|64|59.428|41.569|70||resnet18|32|94.044|66.61|71||resnet18|32|113.002|55.647|49| |
| 42 | +|resnet50|64|196.75|165.706|84||resnet50|32|316.899|215.245|68||resnet50|32|271.778|145.842|54| |
| 43 | +|convnext_small|64|632.215|478.088|76||convnext_small|32|881.586|751.286|85||convnext_small|32|670.291|294.405|44| |
| 44 | +|vgg16|64|310.767|205.745|66||vgg16|32|490.68|351.488|72||vgg16|32|801.684|333.954|42| |
| 45 | +|densenet161|64|415.707|410.906|99||densenet161|32|589.712|510.883|87||densenet161|32|685.154|315.407|46| |
| 46 | +|mobilenet_v2|64|93.699|77.774|83||mobilenet_v2|32|162.4|87.376|54||mobilenet_v2|32|100.363|51.589|51| |
| 47 | +|mobilenet_v3_small|64|25.653|22.253|87||mobilenet_v3_small|32|50.097|28.739|57||mobilenet_v3_small|32|36.92|26.508|72| |
| 48 | +|mobilenet_v3_large|64|70.409|63.28|90||mobilenet_v3_large|32|122.416|69.432|57||mobilenet_v3_large|32|84.413|52.328|62| |
| 49 | +|resnext50_32x4d|64|274.967|245.411|89||resnext50_32x4d|32|440.411|284.571|65||resnext50_32x4d|32|359.037|169.194|47| |
| 50 | +|wide_resnet50_2|64|404.214|321.398|80||wide_resnet50_2|32|589.164|376.938|64||wide_resnet50_2|32|682.184|321.014|47| |
| 51 | +|mnasnet1_0|64|75.027|74.211|99||mnasnet1_0|32|133.324|83.407|63||mnasnet1_0|32|91.441|51.785|57| |
| 52 | +|efficientnet_b0|64|114.735|104.417|91||efficientnet_b0|32|203.531|111.822|55||efficientnet_b0|32|129.755|88.131|68| |
| 53 | +|regnet_y_400mf|64|57.408|43.313|75||regnet_y_400mf|32|96.079|99.022|103||regnet_y_400mf|32|87.756|56.503|64| |
| 54 | +|Average||||85||Average||||69||Average||||54| |
0 commit comments