diff --git a/tutorial/hello_world/README.md b/tutorial/hello_world/README.md index 83257423..e4eb11cc 100644 --- a/tutorial/hello_world/README.md +++ b/tutorial/hello_world/README.md @@ -22,7 +22,7 @@ This folder contains the following files: Before running this example, ensure that you have followed the Ryzen AI Installation instructions found [here](https://ryzenai.docs.amd.com/en/latest/inst.html) and have activated the conda environment created during installation. -Create a clone of the Ryzen AI installation conda environment to add required python packages +Create a clone of the Ryzen AI installation conda environment and activate it. ```python set RYZEN_AI_CONDA_ENV_NAME=ryzen-ai- diff --git a/tutorial/hello_world/hello_world.ipynb b/tutorial/hello_world/hello_world.ipynb index 6ba558e3..19dc0733 100644 --- a/tutorial/hello_world/hello_world.ipynb +++ b/tutorial/hello_world/hello_world.ipynb @@ -80,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ "import onnx\n", "import shutil\n", "from timeit import default_timer as timer\n", - "import vai_q_onnx" + "#import vai_q_onnx" ] }, { @@ -112,7 +112,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "APU Type: PHX/HPT\n" + "NPU Type: KRK\n" ] } ], @@ -129,10 +129,11 @@ " if 'PCI\\\\VEN_1022&DEV_17F0&REV_00' in stdout.decode(): npu_type = 'STX'\n", " if 'PCI\\\\VEN_1022&DEV_17F0&REV_10' in stdout.decode(): npu_type = 'STX'\n", " if 'PCI\\\\VEN_1022&DEV_17F0&REV_11' in stdout.decode(): npu_type = 'STX'\n", + " if 'PCI\\\\VEN_1022&DEV_17F0&REV_20' in stdout.decode(): npu_type = 'KRK'\n", " return npu_type\n", "\n", "npu_type = get_npu_info()\n", - "print(f\"NPU Type: {npu_type}\")" + "print(f\"APU Type: {npu_type}\")" ] }, { @@ -144,8 +145,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Setting environment for PHX/HPT\n", - "XLNX_VART_FIRMWARE= C:\\Program Files\\RyzenAI\\1.2.0\\voe-4.0-win_amd64\\xclbins\\phoenix\\1x4.xclbin\n", + "Setting environment for STX/KRK\n", + "XLNX_VART_FIRMWARE= C:\\Program Files\\RyzenAI\\1.6.0\\voe-4.0-win_amd64\\xclbins\\strix\\AMD_AIE2P_4x4_Overlay.xclbin\n", "NUM_OF_DPU_RUNNERS= 1\n", "XLNX_TARGET_NAME= AMD_AIE2_Nx4_Overlay\n" ] @@ -165,19 +166,19 @@ " os.environ['XLNX_VART_FIRMWARE']= os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'phoenix', '4x4.xclbin')\n", " os.environ['NUM_OF_DPU_RUNNERS']='1'\n", " os.environ['XLNX_TARGET_NAME']='AMD_AIE2_Nx4_Overlay'\n", - " case 'STX':\n", - " print(\"Setting environment for STX\")\n", + " case 'STX' | 'KRK':\n", + " print(\"Setting environment for STX/KRK\")\n", " os.environ['XLNX_VART_FIRMWARE']= os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'strix', 'AMD_AIE2P_4x4_Overlay.xclbin')\n", " os.environ['NUM_OF_DPU_RUNNERS']='1'\n", " os.environ['XLNX_TARGET_NAME']='AMD_AIE2_Nx4_Overlay'\n", " case _:\n", - " print(\"Unrecognized NPU type. Exiting.\")\n", + " print(\"Unrecognized APU type. 
Exiting.\")\n", " exit()\n", " print('XLNX_VART_FIRMWARE=', os.environ['XLNX_VART_FIRMWARE'])\n", " print('NUM_OF_DPU_RUNNERS=', os.environ['NUM_OF_DPU_RUNNERS'])\n", " print('XLNX_TARGET_NAME=', os.environ['XLNX_TARGET_NAME'])\n", "\n", - "set_environment_variable(apu_type)" + "set_environment_variable(npu_type)" ] }, { @@ -190,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -256,9 +257,158 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\kfreidank\\AppData\\Local\\Temp\\ipykernel_18628\\2861012411.py:13: UserWarning: # 'dynamic_axes' is not recommended when dynamo=True, and may lead to 'torch._dynamo.exc.UserError: Constraints violated.' Supply the 'dynamic_shapes' argument instead if export is unsuccessful.\n", + " torch.onnx.export(\n", + "W1213 23:10:49.984000 18628 site-packages\\torch\\onnx\\_internal\\exporter\\_compat.py:114] Setting ONNX exporter to use operator set version 18 because the requested opset_version 17 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features\n", + "W1213 23:10:51.139000 18628 site-packages\\torch\\onnx\\_internal\\exporter\\_registration.py:107] torchvision is not installed. Skipping torchvision::nms\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[torch.onnx] Obtain model graph for `SmallModel([...]` with `torch.export.export(..., strict=False)`...\n", + "[torch.onnx] Obtain model graph for `SmallModel([...]` with `torch.export.export(..., strict=False)`... ✅\n", + "[torch.onnx] Run decomposition...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The model version conversion is not supported by the onnxscript version converter and fallback is enabled. The model will be converted using the onnx C API (target version: 17).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[torch.onnx] Run decomposition... ✅\n", + "[torch.onnx] Translate the graph into ONNX...\n", + "[torch.onnx] Translate the graph into ONNX... 
✅\n" + ] + }, + { + "data": { + "text/plain": [ + "ONNXProgram(\n", + " model=\n", + " <\n", + " ir_version=10,\n", + " opset_imports={'': 17},\n", + " producer_name='pytorch',\n", + " producer_version='2.9.1+cpu',\n", + " domain=None,\n", + " model_version=None,\n", + " >\n", + " graph(\n", + " name=main_graph,\n", + " inputs=(\n", + " %\"input\"\n", + " ),\n", + " outputs=(\n", + " %\"output\"\n", + " ),\n", + " initializers=(\n", + " %\"conv1.weight\"{TorchTensor(...)},\n", + " %\"conv1.bias\"{TorchTensor(...)},\n", + " %\"conv2.bias\"{TorchTensor(...)},\n", + " %\"conv3.bias\"{TorchTensor(...)},\n", + " %\"conv4.bias\"{TorchTensor(...)},\n", + " %\"conv2.weight\"{TorchTensor(...)},\n", + " %\"conv3.weight\"{TorchTensor(...)},\n", + " %\"conv4.weight\"{TorchTensor(...)},\n", + " %\"scalar_tensor_default\"{Tensor(array(1., dtype=float32), name='scalar_tensor_default')}\n", + " ),\n", + " ) {\n", + " 0 | # node_conv2d\n", + " %\"conv2d\" ⬅️ ::Conv(%\"input\", %\"conv1.weight\"{...}, %\"conv1.bias\"{...}) {group=1, auto_pad='NOTSET', dilations=(1, 1), strides=(1, 1), pads=(1, 1, 1, 1)}\n", + " 1 | # node_relu\n", + " %\"relu\" ⬅️ ::Relu(%\"conv2d\")\n", + " 2 | # node_conv2d_1\n", + " %\"conv2d_1\" ⬅️ ::Conv(%\"relu\", %\"conv2.weight\"{...}, %\"conv2.bias\"{...}) {group=1, auto_pad='NOTSET', dilations=(1, 1), strides=(1, 1), pads=(1, 1, 1, 1)}\n", + " 3 | # node_relu_1\n", + " %\"relu_1\" ⬅️ ::Relu(%\"conv2d_1\")\n", + " 4 | # node_conv2d_2\n", + " %\"conv2d_2\" ⬅️ ::Conv(%\"relu_1\", %\"conv3.weight\"{...}, %\"conv3.bias\"{...}) {group=1, auto_pad='NOTSET', dilations=(1, 1), strides=(1, 1), pads=(1, 1, 1, 1)}\n", + " 5 | # node_relu_2\n", + " %\"relu_2\" ⬅️ ::Relu(%\"conv2d_2\")\n", + " 6 | # node_conv2d_3\n", + " %\"conv2d_3\" ⬅️ ::Conv(%\"relu_2\", %\"conv4.weight\"{...}, %\"conv4.bias\"{...}) {group=1, auto_pad='NOTSET', dilations=(1, 1), strides=(1, 1), pads=(1, 1, 1, 1)}\n", + " 7 | # node_relu_3\n", + " %\"relu_3\" ⬅️ ::Relu(%\"conv2d_3\")\n", + " 8 | # node_add_40\n", + " %\"output\" ⬅️ ::Add(%\"relu_3\", %\"scalar_tensor_default\"{1.0})\n", + " return %\"output\"\n", + " }\n", + "\n", + "\n", + " ,\n", + " exported_program=\n", + " ExportedProgram:\n", + " class GraphModule(torch.nn.Module):\n", + " def forward(self, p_conv1_weight: \"f32[32, 3, 3, 3]\", p_conv1_bias: \"f32[32]\", p_conv2_weight: \"f32[64, 32, 3, 3]\", p_conv2_bias: \"f32[64]\", p_conv3_weight: \"f32[128, 64, 3, 3]\", p_conv3_bias: \"f32[128]\", p_conv4_weight: \"f32[256, 128, 3, 3]\", p_conv4_bias: \"f32[256]\", x: \"f32[s77, 3, 224, 224]\"):\n", + " # File: c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\torch\\nn\\modules\\conv.py:548 in forward, code: return self._conv_forward(input, self.weight, self.bias)\n", + " conv2d: \"f32[s77, 32, 224, 224]\" = torch.ops.aten.conv2d.default(x, p_conv1_weight, p_conv1_bias, [1, 1], [1, 1]); x = p_conv1_weight = p_conv1_bias = None\n", + " \n", + " # File: c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\torch\\nn\\modules\\activation.py:144 in forward, code: return F.relu(input, inplace=self.inplace)\n", + " relu: \"f32[s77, 32, 224, 224]\" = torch.ops.aten.relu.default(conv2d); conv2d = None\n", + " \n", + " # File: c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\torch\\nn\\modules\\conv.py:548 in forward, code: return self._conv_forward(input, self.weight, self.bias)\n", + " conv2d_1: \"f32[s77, 64, 224, 224]\" = torch.ops.aten.conv2d.default(relu, p_conv2_weight, p_conv2_bias, [1, 1], [1, 1]); 
relu = p_conv2_weight = p_conv2_bias = None\n", + " \n", + " # File: c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\torch\\nn\\modules\\activation.py:144 in forward, code: return F.relu(input, inplace=self.inplace)\n", + " relu_1: \"f32[s77, 64, 224, 224]\" = torch.ops.aten.relu.default(conv2d_1); conv2d_1 = None\n", + " \n", + " # File: c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\torch\\nn\\modules\\conv.py:548 in forward, code: return self._conv_forward(input, self.weight, self.bias)\n", + " conv2d_2: \"f32[s77, 128, 224, 224]\" = torch.ops.aten.conv2d.default(relu_1, p_conv3_weight, p_conv3_bias, [1, 1], [1, 1]); relu_1 = p_conv3_weight = p_conv3_bias = None\n", + " \n", + " # File: c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\torch\\nn\\modules\\activation.py:144 in forward, code: return F.relu(input, inplace=self.inplace)\n", + " relu_2: \"f32[s77, 128, 224, 224]\" = torch.ops.aten.relu.default(conv2d_2); conv2d_2 = None\n", + " \n", + " # File: c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\torch\\nn\\modules\\conv.py:548 in forward, code: return self._conv_forward(input, self.weight, self.bias)\n", + " conv2d_3: \"f32[s77, 256, 224, 224]\" = torch.ops.aten.conv2d.default(relu_2, p_conv4_weight, p_conv4_bias, [1, 1], [1, 1]); relu_2 = p_conv4_weight = p_conv4_bias = None\n", + " \n", + " # File: c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\torch\\nn\\modules\\activation.py:144 in forward, code: return F.relu(input, inplace=self.inplace)\n", + " relu_3: \"f32[s77, 256, 224, 224]\" = torch.ops.aten.relu.default(conv2d_3); conv2d_3 = None\n", + " \n", + " # File: C:\\Users\\kfreidank\\AppData\\Local\\Temp\\ipykernel_18628\\666140956.py:25 in forward, code: x = torch.add(x, 1)\n", + " scalar_tensor_default: \"f32[]\" = torch.ops.aten.scalar_tensor.default(1, dtype = torch.float32)\n", + " add_40: \"f32[1, 256, 224, 224]\" = torch.ops.aten.add.Tensor(relu_3, scalar_tensor_default); relu_3 = scalar_tensor_default = None\n", + " return (add_40,)\n", + " \n", + " Graph signature: \n", + " # inputs\n", + " p_conv1_weight: PARAMETER target='conv1.weight'\n", + " p_conv1_bias: PARAMETER target='conv1.bias'\n", + " p_conv2_weight: PARAMETER target='conv2.weight'\n", + " p_conv2_bias: PARAMETER target='conv2.bias'\n", + " p_conv3_weight: PARAMETER target='conv3.weight'\n", + " p_conv3_bias: PARAMETER target='conv3.bias'\n", + " p_conv4_weight: PARAMETER target='conv4.weight'\n", + " p_conv4_bias: PARAMETER target='conv4.bias'\n", + " x: USER_INPUT\n", + " \n", + " # outputs\n", + " add_40: USER_OUTPUT\n", + " \n", + " Range constraints: {s77: VR[0, int_oo]}\n", + "\n", + ")" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Generate dummy input data\n", "batch_size = 1\n", @@ -274,7 +424,7 @@ "# Call export function\n", "torch.onnx.export(\n", " pytorch_model,\n", - " inputs,\n", + " dummy_input,\n", " tmp_model_path,\n", " export_params=True,\n", " opset_version=17, # Recommended opset\n", @@ -284,6 +434,11 @@ " )" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -295,41 +450,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "INFO:vai_q_onnx.quantize:calibration_data_reader is None, using random data for 
calibration\n", - "INFO:vai_q_onnx.quant_utils:The input ONNX model models/helloworld.onnx can create InferenceSession successfully\n", - "INFO:vai_q_onnx.quant_utils:Random input name input shape [1, 3, 224, 224] type \n", - "INFO:vai_q_onnx.quant_utils:Obtained calibration data with 1 iters\n", - "INFO:vai_q_onnx.quantize:Removed initializers from input\n", - "INFO:vai_q_onnx.quantize:Simplified model sucessfully\n", - "INFO:vai_q_onnx.quantize:Loading model...\n" + "\u001b[32m\n", + "[QUARK-INFO]: Checking custom ops library ...\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: The CPU version of custom ops library already exists.\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Checked custom ops library.\u001b[0m\n", + "c:\\Users\\kfreidank\\miniforge3\\envs\\ryzen-hello\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "\u001b[32m\n", + "[QUARK-INFO]: The input ONNX model can create InferenceSession successfully\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Random input name input shape [1, 3, 224, 224] type \u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Obtained calibration data with 1 iters\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[VAI_Q_ONNX_INFO]: Time information:\n", - "2024-08-23 10:12:35.362481\n", - "[VAI_Q_ONNX_INFO]: OS and CPU information:\n", + "The configuration of the quantization is Config(global_quant_config=QuantizationConfig(calibrate_method=, quant_format=, activation_type=, weight_type=, input_nodes=[], output_nodes=[], op_types_to_quantize=[], nodes_to_quantize=[], extra_op_types_to_quantize=[], nodes_to_exclude=[], subgraphs_to_exclude=[], specific_tensor_precision=False, execution_providers=['CPUExecutionProvider'], per_channel=False, reduce_range=False, optimize_model=True, use_dynamic_quant=False, use_external_data_format=False, convert_fp16_to_fp32=False, convert_nchw_to_nhwc=False, include_sq=False, include_rotation=False, include_cle=True, include_auto_mp=False, include_fast_ft=False, enable_npu_cnn=True, enable_npu_transformer=False, debug_mode=False, crypto_mode=False, print_summary=True, ignore_warnings=True, log_severity_level=1, extra_options={'ActivationSymmetric': True, 'UseRandomData': True}))\n", + "[QUARK_INFO]: Time information:\n", + "2025-12-13 23:11:15.395783\n", + "[QUARK_INFO]: OS and CPU information:\n", " system --- Windows\n", - " node --- vgodsoe-ryzen\n", - " release --- 10\n", - " version --- 10.0.26100\n", + " node --- windel\n", + " release --- 11\n", + " version --- 10.0.26200\n", " machine --- AMD64\n", - " processor --- AMD64 Family 25 Model 116 Stepping 1, AuthenticAMD\n", - "[VAI_Q_ONNX_INFO]: Tools version information:\n", - " python --- 3.10.14\n", - " onnx --- 1.16.2\n", - " onnxruntime --- 1.17.0\n", - " vai_q_onnx --- 1.17.0+511d6f4\n", - "[VAI_Q_ONNX_INFO]: Quantized Configuration information:\n", + " processor --- AMD64 Family 26 Model 96 Stepping 0, AuthenticAMD\n", + "[QUARK_INFO]: Tools version information:\n", + " python --- 3.12.11\n", + " onnx --- 1.18.0\n", + " onnxruntime --- 1.23.0.dev20250928\n", + " quark.onnx --- 0.10+db671e3+db671e3\n", + "[QUARK_INFO]: Quantized Configuration information:\n", " model_input --- models/helloworld.onnx\n", " model_output --- models/helloworld_quantized.onnx\n", " calibration_data_reader --- None\n", @@ -338,43 +501,72 @@ " input_nodes --- []\n", " 
output_nodes --- []\n", " op_types_to_quantize --- []\n", - " random_data_reader_input_shape --- []\n", + " extra_op_types_to_quantize --- []\n", " per_channel --- False\n", " reduce_range --- False\n", " activation_type --- QUInt8\n", " weight_type --- QInt8\n", " nodes_to_quantize --- []\n", " nodes_to_exclude --- []\n", + " subgraphs_to_exclude --- []\n", " optimize_model --- True\n", " use_external_data_format --- False\n", " calibrate_method --- PowerOfTwoMethod.MinMSE\n", " execution_providers --- ['CPUExecutionProvider']\n", - " enable_ipu_cnn --- True\n", - " enable_ipu_transformer --- False\n", + " enable_npu_cnn --- True\n", + " enable_npu_transformer --- False\n", " specific_tensor_precision --- False\n", " debug_mode --- False\n", " convert_fp16_to_fp32 --- False\n", " convert_nchw_to_nhwc --- False\n", - " include_cle --- False\n", + " include_cle --- True\n", " include_sq --- False\n", + " include_rotation --- False\n", " include_fast_ft --- False\n", - " extra_options --- {'ActivationSymmetric': True}\n" + " extra_options --- {'ActivationSymmetric': True, 'UseRandomData': True}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:vai_q_onnx.quant_utils:The input ONNX model C:/Users/vgods/AppData/Local/Temp/vai.simp.kpf9kmm3/model_simp.onnx can run inference successfully\n", - "INFO:vai_q_onnx.quantize:optimize the model for better hardware compatibility.\n", - "INFO:vai_q_onnx.quantize:Start calibration...\n", - "INFO:vai_q_onnx.quantize:Start collecting data, runtime depends on your model size and the number of calibration dataset.\n", - "INFO:vai_q_onnx.calibrate:Finding optimal threshold for each tensor using PowerOfTwoMethod.MinMSE algorithm ...\n", - "INFO:vai_q_onnx.calibrate:Use all calibration data to calculate min mse\n", - "Computing range: 100%|██████████| 10/10 [00:04<00:00, 2.30tensor/s]\n", - "INFO:vai_q_onnx.quantize:Finished the calibration of PowerOfTwoMethod.MinMSE which costs 4.6s\n", - "INFO:vai_q_onnx.qdq_quantizer:Remove QuantizeLinear & DequantizeLinear on certain operations(such as conv-relu).\n", - "INFO:vai_q_onnx.refine:Adjust the quantize info to meet the compiler constraints\n" + "\u001b[32m\n", + "[QUARK-INFO]: Removed initializers from input\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Simplified model sucessfully\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Loading model...\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: The input ONNX model can run inference successfully\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Start CrossLayerEqualization...\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: CrossLayerEqualization pattern num: 3\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Total CrossLayerEqualization steps: 1\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: CrossLayerEqualization Done.\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: optimize the model for better hardware compatibility.\u001b[0m\n", + "\u001b[33m\n", + "[QUARK-WARNING]: The opset version is 17 < 20. 
Skipping fusing Gelu.\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Start calibration...\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Start collecting data, runtime depends on your model size and the number of calibration dataset.\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Finding optimal threshold for each tensor using PowerOfTwoMethod.MinMSE algorithm ...\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Use all calibration data to calculate min mse\u001b[0m\n", + "Computing range: 100%|██████████| 10/10 [00:05<00:00, 1.95tensor/s]\n", + "\u001b[32m\n", + "[QUARK-INFO]: Finished the calibration of PowerOfTwoMethod.MinMSE which costs 5.4s\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Remove QuantizeLinear & DequantizeLinear on certain operations(such as conv-relu).\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: Adjust the quantize info to meet the compiler constraints\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: The operation types and their corresponding quantities of the input float model is shown in the table below.\u001b[0m\n" ] }, { @@ -385,7 +577,6 @@ "┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", "│ Conv │ 4 │\n", "│ Relu │ 4 │\n", - "│ Constant │ 1 │\n", "│ Add │ 1 │\n", "├──────────────────────┼──────────────────────────────────┤\n", "│ Quantized model path │ models/helloworld_quantized.onnx │\n", @@ -398,7 +589,6 @@ "┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", "│ Conv │\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46m4 \u001b[0m\u001b[1;38;5;46m \u001b[0m│\n", "│ Relu │\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46m4 \u001b[0m\u001b[1;38;5;46m \u001b[0m│\n", - "│ Constant │\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46m1 \u001b[0m\u001b[1;38;5;46m \u001b[0m│\n", "│ Add │\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46m1 \u001b[0m\u001b[1;38;5;46m \u001b[0m│\n", "├──────────────────────┼──────────────────────────────────┤\n", "│ Quantized model path │\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46mmodels/helloworld_quantized.onnx\u001b[0m\u001b[1;38;5;46m \u001b[0m│\n", @@ -408,6 +598,39 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m\n", + "[QUARK-INFO]: The quantized information for all operation types is shown in the table below.\u001b[0m\n", + "\u001b[32m\n", + "[QUARK-INFO]: The discrepancy between the operation types in the quantized model and the float model is due to the application of graph optimization.\u001b[0m\n" + ] + }, + { + "data": { + "text/html": [ + "
┏━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓\n",
+       "┃ Op Type  Activation  Weights  Bias    ┃\n",
+       "┡━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩\n",
+       "│ Conv    │ UINT8(4)    INT8(4)  INT8(4) │\n",
+       "│ Add     │ UINT8(1)                     │\n",
+       "└─────────┴────────────┴─────────┴─────────┘\n",
+       "
\n" + ], + "text/plain": [ + "┏━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mOp Type\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mActivation\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mWeights\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mBias \u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩\n", + "│ Conv │\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46mUINT8(4) \u001b[0m\u001b[1;38;5;46m \u001b[0m│\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46mINT8(4)\u001b[0m\u001b[1;38;5;46m \u001b[0m│\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46mINT8(4)\u001b[0m\u001b[1;38;5;46m \u001b[0m│\n", + "│ Add │\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46mUINT8(1) \u001b[0m\u001b[1;38;5;46m \u001b[0m│\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46m \u001b[0m│\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46m \u001b[0m\u001b[1;38;5;46m \u001b[0m│\n", + "└─────────┴────────────┴─────────┴─────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", @@ -457,7 +680,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -495,14 +718,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Directory deleted successfully. Starting Fresh.\n" + "Directory 'c:\\Users\\kfreidank\\projects\\amd_demos\\RyzenAI-SW\\tutorial\\hello_world\\cache\\hello_cache' does not exist.\n" ] } ], @@ -513,7 +736,7 @@ "directory_path = os.path.join(current_directory, r'cache\\hello_cache')\n", "cache_directory = os.path.join(current_directory, r'cache')\n", "\n", - "# Check if the directory exists and delete it if it does.\n", + "# Check if the directory exists and delete it if it does\n", "if os.path.exists(directory_path):\n", " shutil.rmtree(directory_path)\n", " print(f\"Directory deleted successfully. Starting Fresh.\")\n", @@ -546,10 +769,15 @@ " xclbin_file = os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'phoenix', '4x4.xclbin')\n", " provider_options = [{\n", " 'target': 'X1',\n", - " 'xclbin': xclbin_file\n", + " 'xclbin': xclbin_file,\n", + " 'log_level':'info',\n", " }]\n", + " case 'STX' | 'KRK':\n", + " provider_options = [{\n", + " 'log_level':'info',\n", + " }]\n", " case _:\n", - " print(\"Unrecognized NPU type. Exiting.\")\n", + " print(\"Unrecognized APU type. 
Exiting.\")\n", " exit()\n", "aie_options = onnxruntime.SessionOptions()\n", "\n", @@ -564,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -583,15 +811,15 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU Execution Time: 0.11257850000004055\n", - "NPU Execution Time: 0.08555689999997185\n" + "CPU Execution Time: 0.17882769999999937\n", + "NPU Execution Time: 0.20666400000001772\n" ] } ], @@ -617,15 +845,34 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For 50 iterations of a small model:\n", + "- CPU Execution Time: 8.32508400000006\n", + "- NPU Execution Time: 8.485271200000227\n" + ] + } + ], "source": [ "iterations = 50 # edit this for more or less\n", "\n", + "npu_total = cpu_total = 0\n", "for i in range(iterations):\n", + " start = timer()\n", " npu_results = aie_session.run(None, {'input': input_data})\n", - "\n" + " npu_total += timer() - start\n", + " start = timer()\n", + " cpu_results = cpu_session.run(None, {'input': input_data})\n", + " cpu_total += timer() - start\n", + "\n", + "print(f\"For {iterations} iterations of a small model:\")\n", + "print(f\"- CPU Execution Time: {cpu_total}\")\n", + "print(f\"- NPU Execution Time: {npu_total}\")" ] }, { @@ -638,7 +885,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "ryzen-hello", "language": "python", "name": "python3" }, @@ -652,7 +899,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.11" } }, "nbformat": 4, diff --git a/tutorial/hello_world/hello_world.py b/tutorial/hello_world/hello_world.py index a5ecb939..4b60516c 100644 --- a/tutorial/hello_world/hello_world.py +++ b/tutorial/hello_world/hello_world.py @@ -58,7 +58,7 @@ def forward(self, x): # Call export function torch.onnx.export( pytorch_model, - inputs, + dummy_input, tmp_model_path, export_params=True, opset_version=17, # Recommended opset @@ -130,6 +130,7 @@ def forward(self, x): if 'PCI\\VEN_1022&DEV_17F0&REV_00' in stdout.decode(): npu_type = 'STX' if 'PCI\\VEN_1022&DEV_17F0&REV_10' in stdout.decode(): npu_type = 'STX' if 'PCI\\VEN_1022&DEV_17F0&REV_11' in stdout.decode(): npu_type = 'STX' +if 'PCI\\VEN_1022&DEV_17F0&REV_20' in stdout.decode(): npu_type = 'KRK' print(f"APU Type: {npu_type}") @@ -139,8 +140,8 @@ def forward(self, x): case 'PHX/HPT': print("Setting xclbin file for PHX/HPT") xclbin_file = os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'phoenix', '4x4.xclbin') - case 'STX': - print("Setting xclbin file for STX") + case 'STX' | 'KRK': + print("Setting xclbin file for STX/KRK") xclbin_file = os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'strix', 'AMD_AIE2P_4x4_Overlay.xclbin') case _: print("Unrecognized APU type. 
Exiting.") @@ -190,3 +191,20 @@ def forward(self, x): print(f"CPU Execution Time: {cpu_total}") print(f"NPU Execution Time: {npu_total}") + + +iterations = 50 # edit this for more or less + +npu_total = cpu_total = 0 +for i in range(iterations): + start = timer() + npu_results = aie_session.run(None, {'input': input_data}) + npu_total += timer() - start + start = timer() + cpu_results = cpu_session.run(None, {'input': input_data}) + cpu_total += timer() - start + +print(f"For {iterations} iterations of a small model:") +print(f"- CPU Execution Time: {cpu_total}") +print(f"- NPU Execution Time: {npu_total}") + diff --git a/tutorial/hello_world/requirements.txt b/tutorial/hello_world/requirements.txt index 2d53cc14..6b0c3baa 100644 --- a/tutorial/hello_world/requirements.txt +++ b/tutorial/hello_world/requirements.txt @@ -1,2 +1,3 @@ torch -ipykernel \ No newline at end of file +ipykernel +onnxscript