PaddlePaddle
diff --git a/‎build.sh‎
Lines changed: 40 additions & 19 deletions b/‎build.sh‎
Lines changed: 40 additions & 19 deletions
diff --git a/cinn/backends/codegen_cuda_dev_test.cc
100644 → 100755
Lines changed: 7 additions & 7 deletions b/cinn/backends/codegen_cuda_dev_test.cc
100644 → 100755
Lines changed: 7 additions & 7 deletions
diff --git a/cinn/backends/compiler.cc
100644 → 100755
Lines changed: 3 additions & 3 deletions b/cinn/backends/compiler.cc
100644 → 100755
Lines changed: 3 additions & 3 deletions
diff --git a/cinn/backends/llvm/execution_engine.cc
100644 → 100755
Lines changed: 5 additions & 5 deletions b/cinn/backends/llvm/execution_engine.cc
100644 → 100755
Lines changed: 5 additions & 5 deletions
diff --git a/cinn/backends/llvm/simple_jit.cc
100644 → 100755
Lines changed: 3 additions & 3 deletions b/cinn/backends/llvm/simple_jit.cc
100644 → 100755
Lines changed: 3 additions & 3 deletions
diff --git a/cinn/backends/llvm/simple_jit.h
100644 → 100755 b/cinn/backends/llvm/simple_jit.h
100644 → 100755
diff --git a/cinn/backends/nvrtc_util.cc
100644 → 100755
Lines changed: 1 addition & 1 deletion b/cinn/backends/nvrtc_util.cc
100644 → 100755
Lines changed: 1 addition & 1 deletion
diff --git a/‎cinn/common/cas.cc‎
Lines changed: 2 additions & 2 deletions b/‎cinn/common/cas.cc‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cinn/common/ir_util.cc‎
Lines changed: 1 addition & 1 deletion b/‎cinn/common/ir_util.cc‎
Lines changed: 1 addition & 1 deletion
diff --git a/cinn/frontend/computation.cc
100644 → 100755
Lines changed: 1 addition & 1 deletion b/cinn/frontend/computation.cc
100644 → 100755
Lines changed: 1 addition & 1 deletion
@@ -42,6 +42,17 @@ function gpu_on {
   cudnn_config=ON
 }
 
+function test_doc {
+    mkdir -p $build_dir
+    cd $build_dir
+    export runtime_include_dir=$workspace/cinn/runtime/cuda
+
+    prepare_ci
+    cmake_
+    build
+    make_doc
+}
+
 function cudnn_off {
   cudnn_config=OFF
 }
@@ -94,36 +105,46 @@ function prepare_ci {
   pip install pre-commit
   pip install clang-format==9.0
   pip install wheel
-  pip install sphinx==3.3.1 sphinx_gallery==0.8.1 recommonmark==0.6.0 exhale scipy breathe==4.24.0 matplotlib
+  pip install sphinx==3.3.1 sphinx_gallery==0.8.1 recommonmark==0.6.0 exhale scipy breathe==4.24.0 matplotlib sphinx_rtd_theme
   pip install paddlepaddle-gpu==2.1.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
 }
 
-function make_doc {
+function prepare_doc_model_file {
     proxy_off
-    cd $workspace/tutorials
-    if [[ -f "ResNet18.tar.gz" ]]; then
-        echo "model file for tutorials already downloaded."
-    elif [[ -f "$build_dir/thirds/ResNet18.tar.gz" ]]; then
-        rm -rf $workspace/tutorials/ResNet18
-        ln -s $build_dir/thirds/ResNet18 $workspace/tutorials/ResNet18
+    local tar_file=$1
+    if [[ -f "$tar_file.tar.gz" ]]; then
+        echo "model file $tar_file.tar.gz for tutorials already downloaded."
+    elif [[ -f "$build_dir/thirds/$tar_file.tar.gz" ]]; then
+        rm -rf $workspace/tutorials/$tar_file
+        ln -s $build_dir/thirds/$tar_file $workspace/tutorials/$tar_file
     else
-        wget http://paddle-inference-dist.bj.bcebos.com/CINN/ResNet18.tar.gz
-        tar -zxvf ResNet18.tar.gz
+        wget https://paddle-inference-dist.bj.bcebos.com/CINN/$tar_file.tar.gz
+        tar -zxvf $tar_file.tar.gz
     fi
+}
+
+function make_doc {
+    proxy_off
+    cd $workspace/tutorials
+    prepare_doc_model_file ResNet50
+    prepare_doc_model_file MobileNetV2
+    prepare_doc_model_file EfficientNet
+    prepare_doc_model_file FaceDet
+
     if [[ $cuda_config == "ON" && ! -d "./is_cuda" ]]; then
         mkdir is_cuda
     fi
-
+    if [[ $cuda_config == "OFF" && -d "./is_cuda" ]]; then
+        rm -rf ./is_cuda
+    fi
     cd $build_dir
     rm -f $workspace/python/cinn/core_api.so
     ln -s $build_dir/cinn/pybind/core_api.so $workspace/python/cinn/
     cd $workspace/docs
     mkdir -p docs/source/cpp
-    cat $workspace/tutorials/matmul.cc | python${py_version} $workspace/tools/gen_c++_tutorial.py  > $workspace/docs/source/matmul.md
+    cat $workspace/tutorials/matmul.cc | python${py_version} $workspace/tools/gen_c++_tutorial.py > $workspace/docs/source/matmul.md
+    cat $workspace/tutorials/load_paddle_model.cc | python${py_version} $workspace/tools/gen_c++_tutorial.py > $workspace/docs/source/load_paddle_model.md
     make html
-    if [[ $cuda_config == "ON" && -d "./is_cuda" ]]; then
-        rm -rf $workspace/tutorials/is_cuda
-    fi
 }
 
 function cmake_ {
@@ -308,6 +329,10 @@ function main {
                 run_test
                 shift
                 ;;
+            test_doc)
+                test_doc
+                shift
+                ;;
             ci)
                 CI
                 shift
@@ -320,10 +345,6 @@ function main {
                 prepare_model
                 shift
                 ;;
-            make_doc)
-                make_doc
-                shift
-                ;;
         esac
     done
 }
 
@@ -89,7 +89,7 @@ TEST(CodeGenCUDA, basic) {
 
   CodeGenCUDA_Dev codegen(target);
 
-  auto func = Lower("elementwise_add", stages, {A, B, C});
+  auto func = Lower("elementwise_mul", stages, {A, B, C});
 
   auto compiled = codegen.Compile(func);
 
@@ -115,7 +115,7 @@ TEST(CodeGenCUDA, Module_output) {
 
   CodeGenCUDA_Dev codegen(target);
 
-  auto func = Lower("elementwise_add", stages, {A, B, C});
+  auto func = Lower("elementwise_mul", stages, {A, B, C});
 
   Module::Builder builder("module", target);
   builder.AddFunction(func);
@@ -149,7 +149,7 @@ TEST(CodeGenCUDA2, test_of_cacheread) {
   stages[B_cache]->ComputeAt(stages[C], 1);
   CodeGenCUDA_Dev codegen(target);
 
-  auto func = Lower("elementwise_add", stages, {A, B, C});
+  auto func = Lower("elementwise_mul", stages, {A, B, C});
 
   Module::Builder builder("module", target);
   builder.AddFunction(func);
@@ -181,7 +181,7 @@ TEST(CodeGenCUDA2, test_of_cacheread) {
 
   dim3 grid(10, 1, 1);
   dim3 block(10, 1, 1);
-  cuda_module.LaunchKernel(0, "elementwise_add", grid, block, args);
+  cuda_module.LaunchKernel(0, "elementwise_mul", grid, block, args);
 
   CUDA_CALL(cudaMemcpy(host_data3.data(),
                        reinterpret_cast<void*>(Cd),
@@ -221,7 +221,7 @@ TEST(CodeGenCUDA2, test_of_splitcudakernel) {
 
   CodeGenCUDA_Dev codegen(target);
 
-  auto func = lang::LowerVec("elementwise_add", stages, {A, B, C, D}, {}, {}, nullptr, target);
+  auto func = lang::LowerVec("elementwise_mul_and_add", stages, {A, B, C, D}, {}, {}, nullptr, target);
 
   Module::Builder builder("module", target);
   for (auto& i : func) {
@@ -251,15 +251,15 @@ typedef char int8_t;
 
 
 __global__
-void __launch_bounds__(200) elementwise_add(const float* __restrict__ X, const float* __restrict__ Y, float* __restrict__ C)
+void __launch_bounds__(200) elementwise_mul_and_add(const float* __restrict__ X, const float* __restrict__ Y, float* __restrict__ C)
 {
   if (((int)blockIdx.x < 100)) {
     if (((int)threadIdx.x < 200)) {
       C[((200 * (int)blockIdx.x) + (int)threadIdx.x)] = (X[((200 * (int)blockIdx.x) + (int)threadIdx.x)] * Y[((200 * (int)blockIdx.x) + (int)threadIdx.x)]);
     };
   };
 }__global__
-void __launch_bounds__(200) elementwise_add_1(const float* __restrict__ X, const float* __restrict__ Y, const float* __restrict__ C, float* __restrict__ D)
+void __launch_bounds__(200) elementwise_mul_and_add_1(const float* __restrict__ X, const float* __restrict__ Y, const float* __restrict__ C, float* __restrict__ D)
 {
   if (((int)blockIdx.x < 100)) {
     if (((int)threadIdx.x < 200)) {
 
@@ -70,14 +70,14 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code,
   auto _host_module_device_module_ = SplitCudaAndHostModule(module);  // NOLINT
   auto& host_module                = std::get<0>(_host_module_device_module_);
   auto& device_module              = std::get<1>(_host_module_device_module_);
-  LOG(INFO) << "[CUDA] host module:\n" << host_module;
+  VLOG(3) << "[CUDA] host module:\n" << host_module;
 
   {  // compile cuda device
-    LOG(INFO) << "[CUDA] device module:\n" << device_module;
+    VLOG(3) << "[CUDA] device module:\n" << device_module;
     CodeGenCUDA_Dev codegen(target_);
     auto source_code = codegen.Compile(device_module);
     if (!code.empty()) source_code = code;
-    LOG(INFO) << "[CUDA] source code:\n" << source_code;
+    VLOG(3) << "[CUDA] source code:\n" << source_code;
     using runtime::cuda::CUDAModule;
 
     backends::NVRTC_Compiler compiler;
 
@@ -98,7 +98,7 @@ std::unique_ptr<llvm::MemoryBuffer> NaiveObjectCache::getObject(const llvm::Modu
     return nullptr;
   }
 
-  LOG(INFO) << "Object for " << m->getModuleIdentifier() << " loaded from cache.";
+  VLOG(3) << "Object for " << m->getModuleIdentifier() << " loaded from cache.";
   return llvm::MemoryBuffer::getMemBuffer(it->second->getMemBufferRef());
 }
 
@@ -178,25 +178,25 @@ void ExecutionEngine::Link(const ir::Module &module) {
 
   decltype(auto) es = jit_->getExecutionSession();
   if (false) {
-    LOG(INFO) << "======= dump jit execution session ======";
+    VLOG(3) << "======= dump jit execution session ======";
     std::string buffer;
     llvm::raw_string_ostream os(buffer);
     es.dump(os);
     os.flush();
-    LOG(INFO) << buffer;
+    VLOG(3) << buffer;
   }
 }
 
 bool ExecutionEngine::AddModule(std::unique_ptr<llvm::Module> module, std::unique_ptr<llvm::LLVMContext> context) {
   module->setDataLayout(jit_->getDataLayout());
   if (false) {
-    LOG(INFO) << "======= dump jit lib ==========";
+    VLOG(3) << "======= dump jit lib ==========";
     std::string buffer;
     llvm::raw_string_ostream os(buffer);
     module->print(os, {});
     // main_jd_->dump(os);
     os.flush();
-    LOG(INFO) << buffer;
+    VLOG(3) << buffer;
   }
   llvm::orc::ThreadSafeContext tsc(std::move(context));
   llvm::orc::ThreadSafeModule tsm(std::move(module), std::move(tsc));
 
@@ -71,8 +71,8 @@ void SimpleJIT::AddModule(std::unique_ptr<llvm::Module> module, bool optimize) {
     module_pass_manager.run(*module, module_analysis_manager);
   }
 
-  LOG(INFO) << "jit target: " << jit_->getDataLayout().getStringRepresentation();
-  LOG(INFO) << "module target: " << module->getDataLayout().getStringRepresentation();
+  VLOG(3) << "jit target: " << jit_->getDataLayout().getStringRepresentation();
+  VLOG(3) << "module target: " << module->getDataLayout().getStringRepresentation();
 
   llvm::orc::ThreadSafeModule tsm(std::move(module), context_);
   llvm::cantFail(jit_->addIRModule(std::move(tsm)));
@@ -82,7 +82,7 @@ void SimpleJIT::AddModule(std::unique_ptr<llvm::Module> module, bool optimize) {
     llvm::raw_string_ostream os(buffer);
     jit_->getExecutionSession().dump(os);
     os.flush();
-    LOG(INFO) << "compiled jit:\n" << buffer;
+    VLOG(3) << "compiled jit:\n" << buffer;
   }
 }
 
 
@@ -91,7 +91,7 @@ std::string NVRTC_Compiler::CompilePTX(const std::string& code, bool include_hea
   for (const auto& option : compile_options) {
     param_cstrings.push_back(option.c_str());
   }
-  LOG(INFO) << "compile options: " << utils::Join(compile_options, " ");
+  VLOG(3) << "compile options: " << utils::Join(compile_options, " ");
   NVRTC_CALL(nvrtcCreateProgram(&prog, code.c_str(), nullptr, 0, nullptr, nullptr));
   nvrtcResult compile_res = nvrtcCompileProgram(prog, param_cstrings.size(), param_cstrings.data());
 
 
@@ -2005,8 +2005,8 @@ Expr CasSimplifyMutator::FurtherSimplifyFracWithInterval(
       auto it     = var_intervals.find(bv->name);
       auto ai_abs = std::abs(ai->value);
       if (it != var_intervals.end()) {
-        LOG(INFO) << "found " << bv->name << " " << it->second << " "
-                  << " ai " << ai_abs;
+        VLOG(3) << "found " << bv->name << " " << it->second << " "
+                << " ai " << ai_abs;
       }
       if (it != var_intervals.end() && std::abs(it->second.r) > ai_abs && std::abs(it->second.l) > ai_abs) {
         return make_const(a.type(), 0);
 
@@ -125,7 +125,7 @@ Expr RampRelatedMul(Expr a, Expr b) {
     CHECK_EQ(a_broadcast->lanes, b_broadcast->lanes);
     return ir::Broadcast::Make(a_broadcast->value * b_broadcast->value, a_broadcast->lanes);
   } else {
-    LOG(INFO) << "a,b: " << a << " " << b;
+    VLOG(3) << "a,b: " << a << " " << b;
     CINN_NOT_IMPLEMENTED
   }
 }
 
@@ -127,7 +127,7 @@ std::shared_ptr<CinnComputation> CinnComputation::CompilePaddleModel(
   }
   program->SetInputs({input_vars});
   program->Validate();
-  LOG(INFO) << "program:\n" << *program;
+  VLOG(3) << "program:\n" << *program;
 
   for (auto &name : fetch_names) {
     output_vars.push_back(varmap.at(name));
Original file line number	Diff line number	Diff line change
`@@ -71,8 +71,8 @@ void SimpleJIT::AddModule(std::unique_ptr<llvm::Module> module, bool optimize) {`
`71`	`71`	`module_pass_manager.run(*module, module_analysis_manager);`
`72`	`72`	`}`
`73`	`73`
`74`		`- LOG(INFO) << "jit target: " << jit_->getDataLayout().getStringRepresentation();`
`75`		`- LOG(INFO) << "module target: " << module->getDataLayout().getStringRepresentation();`
	`74`	`+ VLOG(3) << "jit target: " << jit_->getDataLayout().getStringRepresentation();`
	`75`	`+ VLOG(3) << "module target: " << module->getDataLayout().getStringRepresentation();`
`76`	`76`
`77`	`77`	`llvm::orc::ThreadSafeModule tsm(std::move(module), context_);`
`78`	`78`	`llvm::cantFail(jit_->addIRModule(std::move(tsm)));`
`@@ -82,7 +82,7 @@ void SimpleJIT::AddModule(std::unique_ptr<llvm::Module> module, bool optimize) {`
`82`	`82`	`llvm::raw_string_ostream os(buffer);`
`83`	`83`	`jit_->getExecutionSession().dump(os);`
`84`	`84`	`os.flush();`
`85`		`- LOG(INFO) << "compiled jit:\n" << buffer;`
	`85`	`+ VLOG(3) << "compiled jit:\n" << buffer;`
`86`	`86`	`}`
`87`	`87`	`}`
`88`	`88`
Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,7 @@ std::string NVRTC_Compiler::CompilePTX(const std::string& code, bool include_hea`
`91`	`91`	`for (const auto& option : compile_options) {`
`92`	`92`	`param_cstrings.push_back(option.c_str());`
`93`	`93`	`}`
`94`		`- LOG(INFO) << "compile options: " << utils::Join(compile_options, " ");`
	`94`	`+ VLOG(3) << "compile options: " << utils::Join(compile_options, " ");`
`95`	`95`	`NVRTC_CALL(nvrtcCreateProgram(&prog, code.c_str(), nullptr, 0, nullptr, nullptr));`
`96`	`96`	`nvrtcResult compile_res = nvrtcCompileProgram(prog, param_cstrings.size(), param_cstrings.data());`
`97`	`97`
Original file line number	Diff line number	Diff line change
`@@ -125,7 +125,7 @@ Expr RampRelatedMul(Expr a, Expr b) {`
`125`	`125`	`CHECK_EQ(a_broadcast->lanes, b_broadcast->lanes);`
`126`	`126`	`return ir::Broadcast::Make(a_broadcast->value * b_broadcast->value, a_broadcast->lanes);`
`127`	`127`	`} else {`
`128`		`- LOG(INFO) << "a,b: " << a << " " << b;`
	`128`	`+ VLOG(3) << "a,b: " << a << " " << b;`
`129`	`129`	`CINN_NOT_IMPLEMENTED`
`130`	`130`	`}`
`131`	`131`	`}`
Original file line number	Diff line number	Diff line change
`@@ -127,7 +127,7 @@ std::shared_ptr<CinnComputation> CinnComputation::CompilePaddleModel(`
`127`	`127`	`}`
`128`	`128`	`program->SetInputs({input_vars});`
`129`	`129`	`program->Validate();`
`130`		`- LOG(INFO) << "program:\n" << *program;`
	`130`	`+ VLOG(3) << "program:\n" << *program;`
`131`	`131`
`132`	`132`	`for (auto &name : fetch_names) {`
`133`	`133`	`output_vars.push_back(varmap.at(name));`