Commit f850ad0

[FFI] Rename NDArray to Tensor
This PR applies the recent TVM FFI refactor that renames NDArray to Tensor, aligning with the PyTorch naming convention. Alongside the rename, it also applies the corresponding device API renames and the `register_global_func` rename.

NOTE: This PR contains breaking changes:
- All compiled model libraries are invalidated. Please recompile all models with `mlc_llm compile` or `MLC_JIT_POLICY=REDO`.
- `ndarray-cache.json` is renamed to `tensor-cache.json`. Please apply this rename in your local model directories.
1 parent c009bde commit f850ad0
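
For downstream C++ code, the rename is mechanical. Below is a minimal before/after sketch, assuming the standard TVM runtime headers; the snippet and its function name are illustrative, not part of this diff:

// Illustrative only: the rename pattern this commit applies.
// Before: #include <tvm/runtime/ndarray.h> and tvm::runtime::NDArray
#include <tvm/runtime/tensor.h>

using tvm::runtime::Tensor;  // was tvm::runtime::NDArray

Tensor MakeCpuImageBuffer(int height, int width) {
  // Tensor::Empty keeps the signature NDArray::Empty had: shape, dtype, device.
  return Tensor::Empty({height, width, 3}, {kDLUInt, 8, 1}, {kDLCPU, 0});
}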

73 files changed (+644, -622 lines)

3rdparty/tvm

Submodule tvm updated 675 files

android/MLCChat/app/src/main/java/ai/mlc/mlcchat/AppViewModel.kt

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ class AppViewModel(application: Application) : AndroidViewModel(application) {
     companion object {
         const val AppConfigFilename = "mlc-app-config.json"
         const val ModelConfigFilename = "mlc-chat-config.json"
-        const val ParamsConfigFilename = "ndarray-cache.json"
+        const val ParamsConfigFilename = "tensor-cache.json"
         const val ModelUrlSuffix = "resolve/main/"
    }

android/mlc4j/src/cpp/tvm_runtime.h

Lines changed: 2 additions & 2 deletions

@@ -23,23 +23,23 @@
 #include <runtime/logging.cc>
 #include <runtime/memory/memory_manager.cc>
 #include <runtime/module.cc>
-#include <runtime/ndarray.cc>
 #include <runtime/nvtx.cc>
 #include <runtime/opencl/opencl_device_api.cc>
 #include <runtime/opencl/opencl_module.cc>
 #include <runtime/opencl/opencl_wrapper/opencl_wrapper.cc>
 #include <runtime/profiling.cc>
 #include <runtime/source_utils.cc>
+#include <runtime/tensor.cc>
 #include <runtime/thread_pool.cc>
 #include <runtime/threading_backend.cc>
 #include <runtime/vm/attn_backend.cc>
 #include <runtime/vm/builtin.cc>
 #include <runtime/vm/bytecode.cc>
 #include <runtime/vm/executable.cc>
 #include <runtime/vm/kv_state.cc>
-#include <runtime/vm/ndarray_cache_support.cc>
 #include <runtime/vm/paged_kv_cache.cc>
 #include <runtime/vm/rnn_state.cc>
+#include <runtime/vm/tensor_cache_support.cc>
 #include <runtime/vm/vm.cc>
 #include <runtime/workspace_pool.cc>

cpp/json_ffi/conv_template.cc

Lines changed: 5 additions & 5 deletions

@@ -314,7 +314,7 @@ Result<std::vector<Data>> CreatePrompt(const Conversation& conv,
       // should be a map, with a "url" key containing the URL, but
       // we are just assuming this as the URL for now
       std::string base64_image = image_url.substr(image_url.find(",") + 1);
-      Result<NDArray> image_data_res = LoadImageFromBase64(base64_image);
+      Result<Tensor> image_data_res = LoadImageFromBase64(base64_image);
       if (image_data_res.IsErr()) {
         return TResult::Error(image_data_res.UnwrapErr());
       }
@@ -326,18 +326,18 @@ Result<std::vector<Data>> CreatePrompt(const Conversation& conv,

       int embed_size = (image_size * image_size) / (patch_size * patch_size);

-      NDArray image_data = image_data_res.Unwrap();
+      Tensor image_data = image_data_res.Unwrap();
       std::vector<int64_t> new_shape = {1, image_size, image_size, 3};
-      NDArray image_ndarray = image_data.CreateView(new_shape, image_data.DataType());
+      Tensor image_tensor = image_data.CreateView(new_shape, image_data.DataType());
       // TODO: Not sure if commenting will affect other functions. But
-      // python part will do clip preprocessing. auto image_ndarray =
+      // python part will do clip preprocessing. auto image_tensor =
       // ClipPreprocessor(image_data_res.Unwrap(), image_size, device);
       // lazily commit text data
       if (pending_text.length() != 0) {
         message_list.push_back(TextData(pending_text));
         pending_text = "";
       }
-      message_list.push_back(ImageData(image_ndarray, embed_size));
+      message_list.push_back(ImageData(image_tensor, embed_size));
     } else {
       return TResult::Error("Unsupported content type: " + it_type->second);
     }

cpp/json_ffi/image_utils.cc

Lines changed: 11 additions & 11 deletions

@@ -51,8 +51,8 @@ size_t Base64DecodedSize(const std::string& base64_str) {
   return 3 * len / 4 - padding;
 }

-Result<NDArray> LoadImageFromBase64(const std::string& base64_str) {
-  using TResult = Result<NDArray>;
+Result<Tensor> LoadImageFromBase64(const std::string& base64_str) {
+  using TResult = Result<Tensor>;
   MemoryBufferStream stream(base64_str.c_str(), base64_str.size());
   tvm::support::Base64InStream base64_stream(&stream);
   size_t decoded_size = Base64DecodedSize(base64_str);
@@ -65,13 +65,13 @@ Result<NDArray> LoadImageFromBase64(const std::string& base64_str) {
   if (!image_data) {
     return TResult::Error(stbi_failure_reason());
   }
-  auto image_ndarray = NDArray::Empty({height, width, 3}, {kDLUInt, 8, 1}, {kDLCPU, 0});
-  image_ndarray.CopyFromBytes((void*)image_data, width * height * 3);
+  auto image_tensor = Tensor::Empty({height, width, 3}, {kDLUInt, 8, 1}, {kDLCPU, 0});
+  image_tensor.CopyFromBytes((void*)image_data, width * height * 3);
   stbi_image_free(image_data);
-  return TResult::Ok(image_ndarray);
+  return TResult::Ok(image_tensor);
 }

-NDArray ClipPreprocessor(NDArray image_data, int target_size, DLDevice device) {
+Tensor ClipPreprocessor(Tensor image_data, int target_size, DLDevice device) {
   int height = image_data->shape[0];
   int width = image_data->shape[1];
   // Resize
@@ -143,12 +143,12 @@ NDArray ClipPreprocessor(NDArray image_data, int target_size, DLDevice device) {
     }
   }

-  // Create NDArray
-  auto image_ndarray = NDArray::Empty({1, 3, target_size, target_size}, {kDLFloat, 32, 1}, device);
-  image_ndarray.CopyFromBytes((void*)image_data_channel_first.data(),
-                              target_size * target_size * 3 * sizeof(float));
+  // Create Tensor
+  auto image_tensor = Tensor::Empty({1, 3, target_size, target_size}, {kDLFloat, 32, 1}, device);
+  image_tensor.CopyFromBytes((void*)image_data_channel_first.data(),
+                             target_size * target_size * 3 * sizeof(float));

-  return image_ndarray;
+  return image_tensor;
 }

 } // namespace json_ffi

cpp/json_ffi/image_utils.h

Lines changed: 6 additions & 6 deletions

@@ -6,7 +6,7 @@
 #ifndef MLC_LLM_JSON_FFI_IMAGE_UTILS_H_
 #define MLC_LLM_JSON_FFI_IMAGE_UTILS_H_

-#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/tensor.h>

 #include <optional>
 #include <string>
@@ -17,12 +17,12 @@ namespace mlc {
 namespace llm {
 namespace json_ffi {

-/*! \brief Load a base64 encoded image string into a CPU NDArray of shape {height, width, 3} */
-Result<tvm::runtime::NDArray> LoadImageFromBase64(const std::string& base64_str);
+/*! \brief Load a base64 encoded image string into a CPU Tensor of shape {height, width, 3} */
+Result<tvm::runtime::Tensor> LoadImageFromBase64(const std::string& base64_str);

-/*! \brief Preprocess the CPU image for CLIP encoder and return an NDArray on the given device */
-tvm::runtime::NDArray ClipPreprocessor(tvm::runtime::NDArray image_data, int target_size,
-                                       DLDevice device);
+/*! \brief Preprocess the CPU image for CLIP encoder and return a Tensor on the given device */
+tvm::runtime::Tensor ClipPreprocessor(tvm::runtime::Tensor image_data, int target_size,
+                                      DLDevice device);

 } // namespace json_ffi
 } // namespace llm
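
A hedged usage sketch of the two declarations above, mirroring the Result handling seen in conv_template.cc; the wrapping function, the `device` argument, and the 336-pixel target size are illustrative assumptions:

// Sketch only: decode a base64 image and preprocess it for the CLIP encoder.
using tvm::runtime::Tensor;
using namespace mlc::llm::json_ffi;

Result<Tensor> DecodeForClip(const std::string& base64_image, DLDevice device) {
  Result<Tensor> decoded = LoadImageFromBase64(base64_image);  // HWC uint8 on CPU
  if (decoded.IsErr()) {
    return Result<Tensor>::Error(decoded.UnwrapErr());  // propagate, as CreatePrompt does
  }
  // Returns a {1, 3, target_size, target_size} float32 tensor on `device`.
  return Result<Tensor>::Ok(ClipPreprocessor(decoded.Unwrap(), /*target_size=*/336, device));
}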

cpp/multi_gpu/builtin.cc

Lines changed: 7 additions & 7 deletions

@@ -12,7 +12,7 @@
 #include <tvm/node/cast.h>
 #include <tvm/runtime/disco/builtin.h>
 #include <tvm/runtime/disco/disco_worker.h>
-#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/tensor.h>
 #include <tvm/runtime/vm/vm.h>

 namespace mlc {
@@ -55,7 +55,7 @@ ObjectRef DispatchFunctionByGroup(tvm::ffi::AnyView vm_arg,
   return rv.cast<ObjectRef>();
 }

-ObjectRef SendFromLastGroupToWorker0(NDArray send, Optional<NDArray> recv, Shape shape,
+ObjectRef SendFromLastGroupToWorker0(Tensor send, Optional<Tensor> recv, Shape shape,
                                      DataType dtype) {
   DiscoWorker* worker = DiscoWorker::ThreadLocal();
   int worker_id = worker->worker_id;
@@ -64,18 +64,18 @@ ObjectRef SendFromLastGroupToWorker0(NDArray send, Optional<NDArray> recv, Shape
   CHECK_NE(world_size, group_size) << "Cannot perform when there is only one group.";
   int sender_id = world_size - group_size;
   if (worker_id == 0) {
-    CHECK(recv.defined()) << "The receive NDArray is undefined for worker 0.";
-    NDArray recv_arr = recv.value().CreateView(shape, dtype);
+    CHECK(recv.defined()) << "The receive Tensor is undefined for worker 0.";
+    Tensor recv_arr = recv.value().CreateView(shape, dtype);
     RecvFromWorker(recv_arr, sender_id);
     return recv_arr;
   } else if (worker_id == sender_id) {
     CHECK_EQ(DataType(send->dtype), dtype)
-        << "The src NDArray has mismatched dtype than the expected dtype.";
+        << "The src Tensor has mismatched dtype than the expected dtype.";
     CHECK_EQ(send->ndim, shape.size())
-        << "The src NDArray has mismatched shape than the expected shape.";
+        << "The src Tensor has mismatched shape than the expected shape.";
     for (int i = 0; i < send->ndim; ++i) {
       CHECK_EQ(send->shape[i], shape[i])
-          << "The src NDArray has mismatched shape than the expected shape.";
+          << "The src Tensor has mismatched shape than the expected shape.";
     }
     SendToWorker(send, /*receiver_id=*/0);
     return recv;
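
The worker-0 branch above relies on Tensor::CreateView to reinterpret a preallocated receive buffer before the transfer. A minimal sketch of that pattern, with illustrative names and assuming the same using-declarations as the file above:

// Sketch only: CreateView reinterprets the existing allocation with a new
// shape/dtype without copying, so the receive call fills `recv_buffer` itself.
Tensor PrepareRecvView(Tensor recv_buffer, Shape expected_shape, DataType expected_dtype) {
  Tensor view = recv_buffer.CreateView(expected_shape, expected_dtype);
  // RecvFromWorker(view, sender_id) would then write into the shared buffer.
  return view;
}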

cpp/multi_gpu/multi_gpu_loader.cc

Lines changed: 37 additions & 37 deletions

@@ -11,7 +11,7 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/disco/builtin.h>
 #include <tvm/runtime/disco/disco_worker.h>
-#include <tvm/runtime/vm/ndarray_cache_support.h>
+#include <tvm/runtime/vm/tensor_cache_support.h>

 #include <chrono>
 #include <filesystem>
@@ -31,7 +31,7 @@ namespace llm {
 namespace multi_gpu {

 using tvm::Device;
-using tvm::runtime::vm::NDArrayCacheMetadata;
+using tvm::runtime::vm::TensorCacheMetadata;
 using namespace tvm::runtime;
 using tvm::ffi::Array;
 using tvm::ffi::Function;
@@ -76,11 +76,11 @@ class PreprocessorPool {
     }
   }

-  NDArray Apply(NDArray param, const ModelMetadata::Param& param_info) const {
+  Tensor Apply(Tensor param, const ModelMetadata::Param& param_info) const {
     for (const ModelMetadata::Param::Preproc& preproc : param_info.preprocs) {
       const std::string& func_name = preproc.func_name;
-      NDArray param_in = param;
-      param = NDArray::Empty(preproc.out_shape, preproc.out_dtype, param->device);
+      Tensor param_in = param;
+      param = Tensor::Empty(preproc.out_shape, preproc.out_dtype, param->device);
       ICHECK(preproc_funcs.count(func_name));
       DLTensor dl_param_in = *param_in.operator->();
       DLTensor dl_param = *param.operator->();
@@ -94,19 +94,19 @@ class PreprocessorPool {
 };

 struct ParamInfo {
-  const NDArrayCacheMetadata::FileRecord* file;
-  const NDArrayCacheMetadata::FileRecord::ParamRecord* param;
+  const TensorCacheMetadata::FileRecord* file;
+  const TensorCacheMetadata::FileRecord::ParamRecord* param;
 };

-NDArray RecvFromGlobalWorker0(Device device, const ModelMetadata::Param& param_info) {
+Tensor RecvFromGlobalWorker0(Device device, const ModelMetadata::Param& param_info) {
   Shape shape = param_info.preprocs.empty() ? param_info.shape : param_info.preprocs[0].in_shape;
-  NDArray result = NDArray::Empty(shape, param_info.dtype, device);
+  Tensor result = Tensor::Empty(shape, param_info.dtype, device);
   RecvFromWorker0(result);
   return result;
 }

-NDArray BroadcastOrShardAndScatter(NDArray param, const ModelMetadata::Param& param_info,
-                                   int num_shards, const PreprocessorPool& preprocs) {
+Tensor BroadcastOrShardAndScatter(Tensor param, const ModelMetadata::Param& param_info,
+                                  int num_shards, const PreprocessorPool& preprocs) {
   bool needs_sharding = !param_info.preprocs.empty();
   if (!needs_sharding) {
     BroadcastFromWorker0(param, /*in_group=*/true, param);
@@ -119,22 +119,22 @@ NDArray BroadcastOrShardAndScatter(NDArray param, const ModelMetadata::Param& pa
       << "ValueError: The first dimension of the output shape must be equal to the "
       << "number of shards, but got: " << shape << " and num_shards = " << num_shards;
   param = preprocs.Apply(param, param_info);
-  NDArray result = NDArray::Empty(Shape(shape.begin() + 1, shape.end()), dtype, device);
+  Tensor result = Tensor::Empty(Shape(shape.begin() + 1, shape.end()), dtype, device);
   ScatterFromWorker0(param, /*in_group=*/true, result);
   return result;
 }

-NDArray ReceiveBroadcastedOrSharded(Device device, const ModelMetadata::Param& param_info,
-                                    int num_shards) {
+Tensor ReceiveBroadcastedOrSharded(Device device, const ModelMetadata::Param& param_info,
+                                   int num_shards) {
   bool needs_sharding = !param_info.preprocs.empty();
-  NDArray result;
+  Tensor result;
   if (needs_sharding) {
     Shape shape = param_info.preprocs.back().out_shape;
     DataType dtype = param_info.preprocs.back().out_dtype;
-    result = NDArray::Empty(Shape(shape.begin() + 1, shape.end()), dtype, device);
+    result = Tensor::Empty(Shape(shape.begin() + 1, shape.end()), dtype, device);
     ScatterFromWorker0(std::nullopt, /*in_group=*/true, result);
   } else {
-    result = NDArray::Empty(param_info.shape, param_info.dtype, device);
+    result = Tensor::Empty(param_info.shape, param_info.dtype, device);
     BroadcastFromWorker0(result, /*in_group=*/true, result);
   }
   return result;
@@ -147,8 +147,8 @@ std::string FormatDuration(DurationType duration) {
   return os.str();
 }

-Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_module,
-                                      const std::string& model_config_str) {
+Array<Optional<Tensor>> LoadMultiGPU(const std::string& model_path, Module vm_module,
+                                     const std::string& model_config_str) {
   DiscoWorker* worker = DiscoWorker::ThreadLocal();
   Device device = worker->default_device;
   int worker_id = worker->worker_id;
@@ -157,7 +157,7 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
   int group_id = worker_id / group_size;
   LOG(INFO) << "[Worker #" << worker_id << "] Loading model to device: " << device;
   // Step 0. Initialize metadata and paths
-  NDArrayCacheMetadata ndarray_cache_metadata = NDArrayCacheMetadata::Load(model_path);
+  TensorCacheMetadata tensor_cache_metadata = TensorCacheMetadata::Load(model_path);
   picojson::value model_config;
   picojson::parse(model_config, model_config_str);
   ModelMetadata model_metadata =
@@ -175,14 +175,14 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
     param_name2info[param.name] = param;
   }
   // Step 2. Load, preprocess and shard all the parameters
-  std::unordered_map<std::string, NDArray> sharded_params;
+  std::unordered_map<std::string, Tensor> sharded_params;
   if (worker_id == 0) {
     DurationType time_loading(0);
     DurationType time_preproc(0);
     ProgressBar progress_bar(model_metadata.params.size());
     LOG(INFO) << "Loading parameters...";
-    for (const NDArrayCacheMetadata::FileRecord& record : ndarray_cache_metadata.records) {
-      Array<NDArray> loaded_params;
+    for (const TensorCacheMetadata::FileRecord& record : tensor_cache_metadata.records) {
+      Array<Tensor> loaded_params;
       {
         RangeTimer _(&time_loading);
         std::string raw_data_buffer;
@@ -212,7 +212,7 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
             << " Loading " << FormatDuration(time_loading) << " Preprocessing "
            << FormatDuration(time_preproc) << ".";
   } else {
-    for (const NDArrayCacheMetadata::FileRecord& record : ndarray_cache_metadata.records) {
+    for (const TensorCacheMetadata::FileRecord& record : tensor_cache_metadata.records) {
      for (size_t i = 0; i < record.records.size(); ++i) {
        const std::string& param_name = record.records[i].name;
        const ModelMetadata::Param& param_info = param_name2info.at(param_name);
@@ -225,7 +225,7 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
      if (worker_id % group_size == 0) {
        // The worker is the first worker of its worker group (while not the first worker group).
        // Receive the full parameter from the global worker 0.
-       NDArray full_param = RecvFromGlobalWorker0(device, param_info);
+       Tensor full_param = RecvFromGlobalWorker0(device, param_info);
        // Broadcast or shard-scatter this parameter to all workers in its worker group.
        sharded_params[param_name] =
            BroadcastOrShardAndScatter(full_param, param_info, num_shards, preprocs);
@@ -239,17 +239,17 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
   }

   // Step 3. Reorder the sharded parameters according to the order in model_metadata
-  Array<Optional<NDArray>> shards;
+  Array<Optional<Tensor>> shards;
   shards.reserve(model_metadata.params.size());
   for (const ModelMetadata::Param& param : model_metadata.params) {
     const auto& it = sharded_params.find(param.name);
-    shards.push_back(it == sharded_params.end() ? Optional<NDArray>() : it->second);
+    shards.push_back(it == sharded_params.end() ? Optional<Tensor>() : it->second);
   }
   return shards;
 }

-Array<Optional<NDArray>> LoadMultiGPUPresharded(const std::string& model_path, Module vm_module,
-                                                const std::string& model_config_str) {
+Array<Optional<Tensor>> LoadMultiGPUPresharded(const std::string& model_path, Module vm_module,
+                                               const std::string& model_config_str) {
   DiscoWorker* worker = DiscoWorker::ThreadLocal();
   Device device = worker->default_device;
   int worker_id = worker->worker_id;
@@ -259,22 +259,22 @@ Array<Optional<NDArray>> LoadMultiGPUPresharded(const std::string& model_path, M
   int local_worker_id = worker_id % group_size;
   LOG(INFO) << "[Worker #" << worker_id << "] Loading model to device: " << device;
   // Step 0. Initialize metadata and paths
-  NDArrayCacheMetadata ndarray_cache_metadata = NDArrayCacheMetadata::Load(model_path);
+  TensorCacheMetadata tensor_cache_metadata = TensorCacheMetadata::Load(model_path);
   picojson::value model_config;
   picojson::parse(model_config, model_config_str);
   ModelMetadata model_metadata =
       ModelMetadata::FromModule(vm_module, model_config.get<picojson::object>());

   std::unordered_map<std::string, ParamInfo> param_info_map;
-  for (const NDArrayCacheMetadata::FileRecord& file_record : ndarray_cache_metadata.records) {
-    for (const NDArrayCacheMetadata::FileRecord::ParamRecord& param_record : file_record.records) {
+  for (const TensorCacheMetadata::FileRecord& file_record : tensor_cache_metadata.records) {
+    for (const TensorCacheMetadata::FileRecord::ParamRecord& param_record : file_record.records) {
       const std::string& param_name = param_record.name;
       param_info_map[param_name] = ParamInfo{&file_record, &param_record};
     }
   }

-  Array<Optional<NDArray>> params;
-  const NDArrayCacheMetadata::FileRecord* current_file_;
+  Array<Optional<Tensor>> params;
+  const TensorCacheMetadata::FileRecord* current_file_;
   std::string current_file_stream_;
   params.reserve(model_metadata.params.size());
   DurationType time_loading(0);
@@ -283,7 +283,7 @@ Array<Optional<NDArray>> LoadMultiGPUPresharded(const std::string& model_path, M
     if (std::find(param.pipeline_stages.begin(), param.pipeline_stages.end(), group_id) ==
         param.pipeline_stages.end()) {
       // This worker group doesn't need to hold a copy of this parameter.
-      params.push_back(Optional<NDArray>());
+      params.push_back(Optional<Tensor>());
       continue;
     }
     bool needs_sharding = !param.preprocs.empty();
@@ -295,8 +295,8 @@ Array<Optional<NDArray>> LoadMultiGPUPresharded(const std::string& model_path, M
     auto it = param_info_map.find(param_name);
     CHECK(it != param_info_map.end()) << "ValueError: Cannot find parameter: " << param_name;
     const ParamInfo& param_info = (*it).second;
-    const NDArrayCacheMetadata::FileRecord::ParamRecord* param_record = param_info.param;
-    const NDArrayCacheMetadata::FileRecord* file_record = param_info.file;
+    const TensorCacheMetadata::FileRecord::ParamRecord* param_record = param_info.param;
+    const TensorCacheMetadata::FileRecord* file_record = param_info.file;

     if (file_record != current_file_) {
       current_file_ = file_record;
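
A short sketch of iterating the renamed TensorCacheMetadata (previously NDArrayCacheMetadata), using only the types and fields that appear in this diff; the wrapping function is illustrative:

// Sketch only: enumerate parameter records from the tensor-cache metadata.
using tvm::runtime::vm::TensorCacheMetadata;

void LogParamNames(const std::string& model_path) {
  TensorCacheMetadata metadata = TensorCacheMetadata::Load(model_path);
  for (const TensorCacheMetadata::FileRecord& file_record : metadata.records) {
    for (const TensorCacheMetadata::FileRecord::ParamRecord& param_record : file_record.records) {
      LOG(INFO) << "param: " << param_record.name;  // each record names one parameter
    }
  }
}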
