11
11
#include < tvm/runtime/device_api.h>
12
12
#include < tvm/runtime/disco/builtin.h>
13
13
#include < tvm/runtime/disco/disco_worker.h>
14
- #include < tvm/runtime/vm/ndarray_cache_support .h>
14
+ #include < tvm/runtime/vm/tensor_cache_support .h>
15
15
16
16
#include < chrono>
17
17
#include < filesystem>
@@ -31,7 +31,7 @@ namespace llm {
31
31
namespace multi_gpu {
32
32
33
33
using tvm::Device;
34
- using tvm::runtime::vm::NDArrayCacheMetadata ;
34
+ using tvm::runtime::vm::TensorCacheMetadata ;
35
35
using namespace tvm ::runtime;
36
36
using tvm::ffi::Array;
37
37
using tvm::ffi::Function;
@@ -76,11 +76,11 @@ class PreprocessorPool {
76
76
}
77
77
}
78
78
79
- NDArray Apply (NDArray param, const ModelMetadata::Param& param_info) const {
79
+ Tensor Apply (Tensor param, const ModelMetadata::Param& param_info) const {
80
80
for (const ModelMetadata::Param::Preproc& preproc : param_info.preprocs ) {
81
81
const std::string& func_name = preproc.func_name ;
82
- NDArray param_in = param;
83
- param = NDArray ::Empty (preproc.out_shape , preproc.out_dtype , param->device );
82
+ Tensor param_in = param;
83
+ param = Tensor ::Empty (preproc.out_shape , preproc.out_dtype , param->device );
84
84
ICHECK (preproc_funcs.count (func_name));
85
85
DLTensor dl_param_in = *param_in.operator ->();
86
86
DLTensor dl_param = *param.operator ->();
@@ -94,19 +94,19 @@ class PreprocessorPool {
94
94
};
95
95
96
96
// Pairs a parameter's metadata record with the cache file record that
// contains it, so the presharded loader can locate the parameter's raw
// bytes inside the tensor cache without re-scanning all file records.
struct ParamInfo {
  // Cache file that holds this parameter (non-owning; points into the
  // TensorCacheMetadata loaded for the model).
  const TensorCacheMetadata::FileRecord* file;
  // Record describing the parameter within `file` (non-owning).
  const TensorCacheMetadata::FileRecord::ParamRecord* param;
};
100
100
101
- NDArray RecvFromGlobalWorker0 (Device device, const ModelMetadata::Param& param_info) {
101
+ Tensor RecvFromGlobalWorker0 (Device device, const ModelMetadata::Param& param_info) {
102
102
Shape shape = param_info.preprocs .empty () ? param_info.shape : param_info.preprocs [0 ].in_shape ;
103
- NDArray result = NDArray ::Empty (shape, param_info.dtype , device);
103
+ Tensor result = Tensor ::Empty (shape, param_info.dtype , device);
104
104
RecvFromWorker0 (result);
105
105
return result;
106
106
}
107
107
108
- NDArray BroadcastOrShardAndScatter (NDArray param, const ModelMetadata::Param& param_info,
109
- int num_shards, const PreprocessorPool& preprocs) {
108
+ Tensor BroadcastOrShardAndScatter (Tensor param, const ModelMetadata::Param& param_info,
109
+ int num_shards, const PreprocessorPool& preprocs) {
110
110
bool needs_sharding = !param_info.preprocs .empty ();
111
111
if (!needs_sharding) {
112
112
BroadcastFromWorker0 (param, /* in_group=*/ true , param);
@@ -119,22 +119,22 @@ NDArray BroadcastOrShardAndScatter(NDArray param, const ModelMetadata::Param& pa
119
119
<< " ValueError: The first dimension of the output shape must be equal to the "
120
120
<< " number of shards, but got: " << shape << " and num_shards = " << num_shards;
121
121
param = preprocs.Apply (param, param_info);
122
- NDArray result = NDArray ::Empty (Shape (shape.begin () + 1 , shape.end ()), dtype, device);
122
+ Tensor result = Tensor ::Empty (Shape (shape.begin () + 1 , shape.end ()), dtype, device);
123
123
ScatterFromWorker0 (param, /* in_group=*/ true , result);
124
124
return result;
125
125
}
126
126
127
- NDArray ReceiveBroadcastedOrSharded (Device device, const ModelMetadata::Param& param_info,
128
- int num_shards) {
127
+ Tensor ReceiveBroadcastedOrSharded (Device device, const ModelMetadata::Param& param_info,
128
+ int num_shards) {
129
129
bool needs_sharding = !param_info.preprocs .empty ();
130
- NDArray result;
130
+ Tensor result;
131
131
if (needs_sharding) {
132
132
Shape shape = param_info.preprocs .back ().out_shape ;
133
133
DataType dtype = param_info.preprocs .back ().out_dtype ;
134
- result = NDArray ::Empty (Shape (shape.begin () + 1 , shape.end ()), dtype, device);
134
+ result = Tensor ::Empty (Shape (shape.begin () + 1 , shape.end ()), dtype, device);
135
135
ScatterFromWorker0 (std::nullopt, /* in_group=*/ true , result);
136
136
} else {
137
- result = NDArray ::Empty (param_info.shape , param_info.dtype , device);
137
+ result = Tensor ::Empty (param_info.shape , param_info.dtype , device);
138
138
BroadcastFromWorker0 (result, /* in_group=*/ true , result);
139
139
}
140
140
return result;
@@ -147,8 +147,8 @@ std::string FormatDuration(DurationType duration) {
147
147
return os.str ();
148
148
}
149
149
150
- Array<Optional<NDArray >> LoadMultiGPU (const std::string& model_path, Module vm_module,
151
- const std::string& model_config_str) {
150
+ Array<Optional<Tensor >> LoadMultiGPU (const std::string& model_path, Module vm_module,
151
+ const std::string& model_config_str) {
152
152
DiscoWorker* worker = DiscoWorker::ThreadLocal ();
153
153
Device device = worker->default_device ;
154
154
int worker_id = worker->worker_id ;
@@ -157,7 +157,7 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
157
157
int group_id = worker_id / group_size;
158
158
LOG (INFO) << " [Worker #" << worker_id << " ] Loading model to device: " << device;
159
159
// Step 0. Initialize metadata and paths
160
- NDArrayCacheMetadata ndarray_cache_metadata = NDArrayCacheMetadata ::Load (model_path);
160
+ TensorCacheMetadata tensor_cache_metadata = TensorCacheMetadata ::Load (model_path);
161
161
picojson::value model_config;
162
162
picojson::parse (model_config, model_config_str);
163
163
ModelMetadata model_metadata =
@@ -175,14 +175,14 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
175
175
param_name2info[param.name ] = param;
176
176
}
177
177
// Step 2. Load, preprocess and shard all the parameters
178
- std::unordered_map<std::string, NDArray > sharded_params;
178
+ std::unordered_map<std::string, Tensor > sharded_params;
179
179
if (worker_id == 0 ) {
180
180
DurationType time_loading (0 );
181
181
DurationType time_preproc (0 );
182
182
ProgressBar progress_bar (model_metadata.params .size ());
183
183
LOG (INFO) << " Loading parameters..." ;
184
- for (const NDArrayCacheMetadata ::FileRecord& record : ndarray_cache_metadata .records ) {
185
- Array<NDArray > loaded_params;
184
+ for (const TensorCacheMetadata ::FileRecord& record : tensor_cache_metadata .records ) {
185
+ Array<Tensor > loaded_params;
186
186
{
187
187
RangeTimer _ (&time_loading);
188
188
std::string raw_data_buffer;
@@ -212,7 +212,7 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
212
212
<< " Loading " << FormatDuration (time_loading) << " Preprocessing "
213
213
<< FormatDuration (time_preproc) << " ." ;
214
214
} else {
215
- for (const NDArrayCacheMetadata ::FileRecord& record : ndarray_cache_metadata .records ) {
215
+ for (const TensorCacheMetadata ::FileRecord& record : tensor_cache_metadata .records ) {
216
216
for (size_t i = 0 ; i < record.records .size (); ++i) {
217
217
const std::string& param_name = record.records [i].name ;
218
218
const ModelMetadata::Param& param_info = param_name2info.at (param_name);
@@ -225,7 +225,7 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
225
225
if (worker_id % group_size == 0 ) {
226
226
// The worker is the first worker of its worker group (while not the first worker group).
227
227
// Receive the full parameter from the global worker 0.
228
- NDArray full_param = RecvFromGlobalWorker0 (device, param_info);
228
+ Tensor full_param = RecvFromGlobalWorker0 (device, param_info);
229
229
// Broadcast or shard-scatter this parameter to all workers in its worker group.
230
230
sharded_params[param_name] =
231
231
BroadcastOrShardAndScatter (full_param, param_info, num_shards, preprocs);
@@ -239,17 +239,17 @@ Array<Optional<NDArray>> LoadMultiGPU(const std::string& model_path, Module vm_m
239
239
}
240
240
241
241
// Step 3. Reorder the sharded parameters according to the order in model_metadata
242
- Array<Optional<NDArray >> shards;
242
+ Array<Optional<Tensor >> shards;
243
243
shards.reserve (model_metadata.params .size ());
244
244
for (const ModelMetadata::Param& param : model_metadata.params ) {
245
245
const auto & it = sharded_params.find (param.name );
246
- shards.push_back (it == sharded_params.end () ? Optional<NDArray >() : it->second );
246
+ shards.push_back (it == sharded_params.end () ? Optional<Tensor >() : it->second );
247
247
}
248
248
return shards;
249
249
}
250
250
251
- Array<Optional<NDArray >> LoadMultiGPUPresharded (const std::string& model_path, Module vm_module,
252
- const std::string& model_config_str) {
251
+ Array<Optional<Tensor >> LoadMultiGPUPresharded (const std::string& model_path, Module vm_module,
252
+ const std::string& model_config_str) {
253
253
DiscoWorker* worker = DiscoWorker::ThreadLocal ();
254
254
Device device = worker->default_device ;
255
255
int worker_id = worker->worker_id ;
@@ -259,22 +259,22 @@ Array<Optional<NDArray>> LoadMultiGPUPresharded(const std::string& model_path, M
259
259
int local_worker_id = worker_id % group_size;
260
260
LOG (INFO) << " [Worker #" << worker_id << " ] Loading model to device: " << device;
261
261
// Step 0. Initialize metadata and paths
262
- NDArrayCacheMetadata ndarray_cache_metadata = NDArrayCacheMetadata ::Load (model_path);
262
+ TensorCacheMetadata tensor_cache_metadata = TensorCacheMetadata ::Load (model_path);
263
263
picojson::value model_config;
264
264
picojson::parse (model_config, model_config_str);
265
265
ModelMetadata model_metadata =
266
266
ModelMetadata::FromModule (vm_module, model_config.get <picojson::object>());
267
267
268
268
std::unordered_map<std::string, ParamInfo> param_info_map;
269
- for (const NDArrayCacheMetadata ::FileRecord& file_record : ndarray_cache_metadata .records ) {
270
- for (const NDArrayCacheMetadata ::FileRecord::ParamRecord& param_record : file_record.records ) {
269
+ for (const TensorCacheMetadata ::FileRecord& file_record : tensor_cache_metadata .records ) {
270
+ for (const TensorCacheMetadata ::FileRecord::ParamRecord& param_record : file_record.records ) {
271
271
const std::string& param_name = param_record.name ;
272
272
param_info_map[param_name] = ParamInfo{&file_record, ¶m_record};
273
273
}
274
274
}
275
275
276
- Array<Optional<NDArray >> params;
277
- const NDArrayCacheMetadata ::FileRecord* current_file_;
276
+ Array<Optional<Tensor >> params;
277
+ const TensorCacheMetadata ::FileRecord* current_file_;
278
278
std::string current_file_stream_;
279
279
params.reserve (model_metadata.params .size ());
280
280
DurationType time_loading (0 );
@@ -283,7 +283,7 @@ Array<Optional<NDArray>> LoadMultiGPUPresharded(const std::string& model_path, M
283
283
if (std::find (param.pipeline_stages .begin (), param.pipeline_stages .end (), group_id) ==
284
284
param.pipeline_stages .end ()) {
285
285
// This worker group doesn't need to hold a copy of this parameter.
286
- params.push_back (Optional<NDArray >());
286
+ params.push_back (Optional<Tensor >());
287
287
continue ;
288
288
}
289
289
bool needs_sharding = !param.preprocs .empty ();
@@ -295,8 +295,8 @@ Array<Optional<NDArray>> LoadMultiGPUPresharded(const std::string& model_path, M
295
295
auto it = param_info_map.find (param_name);
296
296
CHECK (it != param_info_map.end ()) << " ValueError: Cannot find parameter: " << param_name;
297
297
const ParamInfo& param_info = (*it).second ;
298
- const NDArrayCacheMetadata ::FileRecord::ParamRecord* param_record = param_info.param ;
299
- const NDArrayCacheMetadata ::FileRecord* file_record = param_info.file ;
298
+ const TensorCacheMetadata ::FileRecord::ParamRecord* param_record = param_info.param ;
299
+ const TensorCacheMetadata ::FileRecord* file_record = param_info.file ;
300
300
301
301
if (file_record != current_file_) {
302
302
current_file_ = file_record;
0 commit comments