@@ -1116,61 +1116,60 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
    return GGML_STATUS_SUCCESS;
}

-static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
-                                 aclDataType dataType, aclTensor **tensor)
-{
-    uint64_t size = 1;
-    for (auto i : shape) {
-        size *= i;
+// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
+namespace {
+    void* g_nz_workspace = nullptr;
+    size_t g_nz_workspace_allocated = 0;
+
+    void release_nz_workspace() {
+        if (g_nz_workspace) {
+            aclrtFree(g_nz_workspace);
+            g_nz_workspace = nullptr;
+            g_nz_workspace_allocated = 0;
+        }
    }

-    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
-    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
-
-    size *= sizeof(int16_t);
-
-    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
-    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
-
-    std::vector<int64_t> strides(shape.size(), 1);
-    for (int64_t i = shape.size() - 2; i >= 0; i--) {
-        strides[i] = shape[i + 1] * strides[i + 1];
+    void relloc_nz_workspace(size_t new_size) {
+        if (new_size > g_nz_workspace_allocated) {
+            if (g_nz_workspace) {
+                aclrtFree(g_nz_workspace);
+                g_nz_workspace = nullptr;
+            }
+            ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            g_nz_workspace_allocated = new_size;
+        }
    }
-
-    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
-                              shape.data(), shape.size(), *deviceAddr);
-    return 0;
}

+/**
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
+ *
+ * This function creates a transposed tensor descriptor and performs the
+ * TransMatmulWeight operation. Converting tensor formats can significantly
+ * improve performance on certain hardware.
+ *
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
+ * @param data   Pointer to the raw data buffer for the tensor weights.
+ * @param offset Byte offset within the tensor data buffer where weights start.
+ *
+ * @note The workspace buffer used in this function is managed globally and reused
+ *       across calls. This reduces overhead from repeated memory allocation and deallocation.
+ */
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
-    aclrtStream stream;
-    ACL_CHECK(aclrtCreateStream(&stream));
-
    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
-    void *weightTransposedDeviceAddr = nullptr;
-    aclTensor *weightTransposed = nullptr;
-    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
-                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
-
+    aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
+                                      tensor->nb, 2, ACL_FORMAT_ND, offset);
    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;
-    void *workspaceAddr = nullptr;

    // TransMatmulWeight
-    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
-    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
-        workspaceAddrPtrTrans.reset(workspaceAddr);
-    }
-    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
+                                                     &workspaceSize, &executor));
+    // Avoid frequent malloc/free of the workspace.
+    relloc_nz_workspace(workspaceSize);

-    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
-
-    aclrtMemcpy((char *)tensor->data + offset, size,
-                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
+    ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));

    ACL_CHECK(aclDestroyTensor(weightTransposed));
-    aclrtFree(weightTransposedDeviceAddr);
}

// TODO: need handle tensor which has paddings.
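For reference, the grow-only workspace cache introduced in this hunk can be exercised in isolation. The sketch below is not the CANN code itself: plain malloc/free stand in for aclrtMalloc/aclrtFree, ACL_CHECK is dropped, and the names g_workspace, realloc_workspace, and release_workspace are illustrative; the policy is the same, though: reallocate only when a request exceeds the cached capacity, and free explicitly once the conversions are finished.

#include <cstdlib>
#include <cstdio>

namespace {
    void * g_workspace = nullptr;        // illustrative stand-in for g_nz_workspace
    size_t g_workspace_allocated = 0;

    // Free the cached buffer and reset the bookkeeping.
    void release_workspace() {
        if (g_workspace) {
            free(g_workspace);
            g_workspace = nullptr;
            g_workspace_allocated = 0;
        }
    }

    // Grow-only reallocation: replace the buffer only when the request exceeds
    // the current capacity; smaller requests reuse the existing allocation.
    // (Allocation-failure handling is omitted in this sketch.)
    void realloc_workspace(size_t new_size) {
        if (new_size > g_workspace_allocated) {
            free(g_workspace);               // free(nullptr) is a no-op
            g_workspace = malloc(new_size);
            g_workspace_allocated = new_size;
        }
    }
}

int main() {
    realloc_workspace(1 << 10);   // allocates 1 KiB
    realloc_workspace(256);       // reuses the 1 KiB buffer, no reallocation
    realloc_workspace(1 << 20);   // grows to 1 MiB
    printf("capacity: %zu bytes\n", g_workspace_allocated);
    release_workspace();          // analogous to release_nz_workspace() at graph compute
    return 0;
}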
@@ -1197,14 +1196,14 @@ static void ggml_backend_cann_buffer_set_tensor(
    // For acl, synchronous functions use this default stream.
    // Why aclrtSynchronizeDevice?

-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
+    // Only check env once.
+    static bool wight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
    if (!need_transform(tensor->type)) {
        ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                              ACL_MEMCPY_HOST_TO_DEVICE));
-        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+        if (wight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
            weight_format_to_nz(tensor, data, offset);
        }
    } else {
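The static local added here relies on C++ guaranteeing that a function-local static is initialized exactly once, so the environment variable is read on the first call only. Below is a minimal standalone sketch of that idiom, assuming std::getenv in place of the repo's get_env/parse_bool helpers; the accepted truthy spellings are illustrative and may differ from parse_bool.

#include <cstdlib>
#include <cstring>
#include <cstdio>

// Illustrative helper: a few truthy spellings; the repo's parse_bool may differ.
static bool env_flag_enabled(const char * name) {
    const char * v = std::getenv(name);
    if (v == nullptr) {
        return false;
    }
    return std::strcmp(v, "1") == 0 || std::strcmp(v, "on") == 0 ||
           std::strcmp(v, "yes") == 0 || std::strcmp(v, "true") == 0;
}

static void set_tensor_like_path() {
    // Initialized once, on the first call; later calls skip the getenv entirely.
    static bool weight_to_nz = env_flag_enabled("GGML_CANN_WEIGHT_NZ");
    std::printf("GGML_CANN_WEIGHT_NZ enabled: %s\n", weight_to_nz ? "yes" : "no");
}

int main() {
    set_tensor_like_path();
    set_tensor_like_path();   // environment is not consulted again
    return 0;
}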
@@ -1440,20 +1439,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];

+    // Only check env once.
+    static bool wight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+
    // last line must bigger than 32, because every single op deal at
    // least 32 bytes.
    // TODO: quantized type?
    // int64_t line_size = ne0 * ggml_element_size(tensor);
    // int64_t line_size_align_32 = (line_size + 31) & ~31;
    // size += (line_size_align_32 - line_size);
-
-    // TODO: not support quantized yet.
-    // TODO: consider un-continue tensor.
    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(
                tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }
+    } else if (wight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+        // NZ-format weights do not support quantized types yet.
+        // Transforming an ND tensor to NZ may change its size.
+        int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
+        GGML_ASSERT(tensor->ne[2] == 1);
+        GGML_ASSERT(tensor->ne[3] == 1);
+        const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
+        size_t new_size;
+        ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
+            ggml_cann_type_mapping(tensor->type), &new_size));
+        ACL_CHECK(aclDestroyIntArray(acl_shape));
+        size = std::max(size, new_size);
    }

    return size;
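The new branch makes the reported allocation at least as large as the converted weight requires. The sketch below shows that sizing rule on its own; the nz_weight_size_bytes() helper is hypothetical and its 16-element rounding is made up for illustration, standing in for the real aclnnCalculateMatmulWeightSizeV2 query shown in the hunk above.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Placeholder for the backend's format-specific size query. It rounds both
// dimensions up to a multiple of 16 elements, which is enough to show why the
// NZ layout can need more bytes than the plain ND layout.
static size_t nz_weight_size_bytes(int64_t rows, int64_t cols, size_t elem_size) {
    const int64_t r = (rows + 15) / 16 * 16;
    const int64_t c = (cols + 15) / 16 * 16;
    return (size_t)(r * c) * elem_size;
}

// Mirror of the new branch: never report less than the ND size, but grow the
// allocation when the converted weight needs more room.
static size_t weight_alloc_size(int64_t ne0, int64_t ne1, size_t elem_size) {
    const size_t nd_size = (size_t)(ne0 * ne1) * elem_size;
    const size_t nz_size = nz_weight_size_bytes(ne1, ne0, elem_size);
    return std::max(nd_size, nz_size);
}

int main() {
    // 100 x 200 fp16 weight: ND needs 40000 bytes, the padded NZ layout needs more.
    std::printf("ND:  %zu bytes\n", (size_t)(100 * 200) * 2);
    std::printf("max: %zu bytes\n", weight_alloc_size(100, 200, 2));
    return 0;
}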
@@ -2080,6 +2091,8 @@ static enum ggml_status ggml_backend_cann_graph_compute(
        (ggml_backend_cann_context*)backend->context;

    ggml_cann_set_device(cann_ctx->device);
+    // Release the temporary buffer created while loading the model.
+    release_nz_workspace();

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor* node = cgraph->nodes[i];