 #include "cuda_allocator.h"
 #include "core/framework/kernel_registry.h"
 #include "core/framework/compute_capability.h"
+#include "core/framework/fallback_cpu_capability.h"
 #include "core/framework/memcpy.h"
 #include "core/graph/graph_utils.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
@@ -1822,9 +1823,7 @@ std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransf
 std::vector<std::unique_ptr<ComputeCapability>>
 CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                      const std::vector<const KernelRegistry*>& kernel_registries) const {
-  std::vector<std::unique_ptr<ComputeCapability>> result;
-  std::unordered_set<const NodeArg*> defs_outside_cuda;
-
+  std::vector<NodeIndex> candidates;
   for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
     const auto* p_node = graph.GetNode(node_index);
     if (p_node == nullptr)
@@ -1833,7 +1832,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     const auto& node = *p_node;
     const KernelCreateInfo* cuda_kernel_def = nullptr;
     if (!node.GetExecutionProviderType().empty()) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }

@@ -1847,14 +1845,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,

     // none of the provided registries has a CUDA kernel for this node
     if (cuda_kernel_def == nullptr) {
-      // node is not in cuda exeuction provider if no kernel def found,
-      // or if other execution provider already assigned to it
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }

     bool not_supported = false;
-    bool force_outside = false;
     bool force_inside = false;  // for some compute heavy ops, we'll force it to run inside CUDA
     if ("LSTM" == node.OpType()) {
       // the supported activations covers the bidirectional mode
@@ -1877,60 +1871,29 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
       // cast is not compute heavy, and may be placed outside
     }

-    // Below rule only works for inference, for training, we can't do constant folding.
-    // We need find a better solution.
-    // Temporary disable the check here, the cost is all the cast will be on GPU now.
-#ifndef ENABLE_TRAINING
-    if (!not_supported && !force_inside) {
-      // Note that nodes with only inputs from initializer would not be place on CUDA
-      // Ideally, those nodes should be eliminated in constant folding
-      bool should_force_outside = true;
-      bool all_inputs_are_initializers = true;
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(),
-                                               [&](const NodeArg& def, size_t index) {
-                                                 // The input is not a initializer and the input is from CPU
-                                                 // or the input declared as CPU memory and is from CPU
-                                                 // in that case we should still keep the node on CUDA
-                                                 bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
-                                                 bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
-                                                 if ((!initializer_input && !input_is_on_cpu) ||
-                                                     (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
-                                                   should_force_outside = false;
-                                                 }
-
-                                                 if (!initializer_input) {
-                                                   all_inputs_are_initializers = false;
-                                                 }
-                                                 return Status::OK();
-                                               }));
-
-      // If all the inputs are initializers, we shouldn't force it to CPU
-      if (should_force_outside && !all_inputs_are_initializers) {
-        force_outside = true;
-      }
-    }
-#endif
-    if (!force_inside && (not_supported || force_outside)) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
+    if (!force_inside && not_supported) {
       if (not_supported) {
        LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
-      } else if (force_outside) {
-        LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
       }
     } else {
-      // for nodes placed on CUDA, check if its output is on CPU
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(
-          node.OutputDefs(),
-          [&](const NodeArg& def, size_t out_index) {
-            if (cuda_kernel_def->kernel_def->OutputMemoryType(out_index) != OrtMemTypeDefault)
-              defs_outside_cuda.insert(&def);
-            return Status::OK();
-          }));
-      std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
-      sub_graph->nodes.push_back(node.Index());
-      result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+      candidates.push_back(node.Index());
     }
   }
+
+  // For CUDA EP, exclude the subgraph that is preferred to be placed in CPU
+  // These are usually shape related computation subgraphs
+  // Following logic can be extended for other EPs
+  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);
+
+  std::vector<std::unique_ptr<ComputeCapability>> result;
+  for (auto& node_index : candidates) {
+    if (cpu_nodes.count(node_index) > 0)
+      continue;
+
+    std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
+    sub_graph->nodes.push_back(node_index);
+    result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+  }
   return result;
 }
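The refactored GetCapability above works in two passes: it first collects every node that has a registered CUDA kernel and no blocking restriction into `candidates`, then asks `GetCpuPreferedNodes` which of those are better kept on CPU (typically shape-computation subgraphs), and only the remaining nodes become single-node `ComputeCapability` entries. Below is a minimal, self-contained sketch of that two-pass pattern; the `NodeIndex` alias, `AssignToGpu`, and the CPU-preference callback are simplified stand-ins for illustration, not the onnxruntime API.

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_set>
#include <vector>

using NodeIndex = std::size_t;  // stand-in for onnxruntime's NodeIndex

// Stand-in for the GetCpuPreferedNodes step: the caller supplies the policy that
// decides which candidate nodes are cheaper to keep on CPU.
std::vector<NodeIndex> AssignToGpu(
    const std::vector<NodeIndex>& candidates,
    const std::function<std::unordered_set<NodeIndex>(const std::vector<NodeIndex>&)>& cpu_policy) {
  const std::unordered_set<NodeIndex> cpu_nodes = cpu_policy(candidates);

  std::vector<NodeIndex> gpu_nodes;
  for (NodeIndex idx : candidates) {
    if (cpu_nodes.count(idx) == 0)
      gpu_nodes.push_back(idx);  // would become a single-node IndexedSubGraph in the real code
  }
  return gpu_nodes;
}

int main() {
  // Toy graph: nodes 0-4 all have a GPU kernel; the policy keeps node 2 on CPU.
  const std::vector<NodeIndex> candidates{0, 1, 2, 3, 4};
  auto keep_node_2_on_cpu = [](const std::vector<NodeIndex>&) {
    return std::unordered_set<NodeIndex>{2};
  };

  for (NodeIndex idx : AssignToGpu(candidates, keep_node_2_on_cpu))
    std::cout << "GPU node: " << idx << '\n';  // prints 0, 1, 3, 4 (one per line)
}
```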