Skip to content
This repository was archived by the owner on Jan 3, 2023. It is now read-only.

Commit 08c4c57

Browse files
sasadepdiyessi
authored and committed
add OpenMPI support besides MLSL (#2353)
* quick fix to add openmpi as default
* add finalize to distributed class & use unit test
* use intel mlsl github link
* apply style
* address a few comments
* fix test
* update nbench cmake
* remove extras
* fix a bug
* add counter to finalize and cleanup
* test ci error
* address mlsl ci error
* update flag names, as mentioned in pr comment
* revert back the link to mlsl repo and tag
* add flag to finalize
* apply style
* debug with info
* delete when flag is true
* add distributed setup class works, tests pass
* fix style
* remove extra instance
* disable the test due to a bug
* change flag to ompi
* remove the dependency of setting NGRAPH_DISTRIBUTED_ENABLE flag
* cleanup
* change extern to static
* remove the option NGRAPH_DISTRIBUTED_ENABLE setting this flag
* formatting
* update flags not caught by ci
* make unique pointer
* remove unused bool, fix clang error
1 parent 13b4966 commit 08c4c57

26 files changed

+407
-84
lines changed

CMakeLists.txt

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,19 +111,31 @@ option(NGRAPH_INTERPRETER_ENABLE "Control the building of the INTERPRETER backen
111111
option(NGRAPH_NOP_ENABLE "Control the building of the NOP backend" TRUE)
112112
option(NGRAPH_GPUH_ENABLE "Control the building of the Hybrid GPU backend" FALSE)
113113
option(NGRAPH_GENERIC_CPU_ENABLE "Enable build nGraph for generic CPU backend" FALSE)
114-
option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE)
115114
option(NGRAPH_DEBUG_ENABLE "Enable output for NGRAPH_DEBUG statements" FALSE)
116115
option(NGRAPH_ONNX_IMPORT_ENABLE "Enable ONNX importer" FALSE)
117116
option(NGRAPH_DEX_ONLY "Build CPU DEX without codegen" FALSE)
118117
option(NGRAPH_CODE_COVERAGE_ENABLE "Enable code coverage data collection" FALSE)
119118
option(NGRAPH_LIB_VERSIONING_ENABLE "Enable shared library versioning" FALSE)
120119
option(NGRAPH_PYTHON_BUILD_ENABLE "Enable build nGraph python package wheel" FALSE)
120+
option(NGRAPH_DISTRIBUTED_MLSL_ENABLE "Add distributed MLSL mode for CPU only backend" FALSE)
121+
option(NGRAPH_DISTRIBUTED_OMPI_ENABLE "Add distributed Open-MPI mode for all backend" FALSE)
121122
option(NGRAPH_PLAIDML_ENABLE "Enable the PlaidML backend" ${PLAIDML_FOUND})
122123

123124
if (NGRAPH_GPUH_ENABLE)
124125
set(NGRAPH_GPU_ENABLE TRUE)
125126
endif()
126127

128+
if (NGRAPH_DISTRIBUTED_MLSL_ENABLE AND NGRAPH_DISTRIBUTED_OMPI_ENABLE)
129+
message(FATAL_ERROR
130+
"Does not support the use of two distributed libraries simultaneously.\n"
131+
"If CPU only backend recommend Intel MLSL by setting NGRAPH_DISTRIBUTED_MLSL_ENABLE flag to true.\n"
132+
"For all other backends use OpenMPI by setting NGRAPH_DISTRIBUTED_OMPI_ENABLE flag to true.\n")
133+
elseif(NGRAPH_DISTRIBUTED_MLSL_ENABLE OR NGRAPH_DISTRIBUTED_OMPI_ENABLE)
134+
set(NGRAPH_DISTRIBUTED_ENABLE TRUE)
135+
else()
136+
set(NGRAPH_DISTRIBUTED_ENABLE FALSE)
137+
endif()
138+
127139
if (NGRAPH_ONNX_IMPORT_ENABLE)
128140
option(NGRAPH_USE_SYSTEM_PROTOBUF "Use system provided Protobuf shared object" FALSE)
129141
option(NGRAPH_ONNXIFI_ENABLE "Enable ONNX Interface for Framework Integration" TRUE)
@@ -138,7 +150,8 @@ message(STATUS "NGRAPH_INTERPRETER_ENABLE: ${NGRAPH_INTERPRETER_ENABLE}")
138150
message(STATUS "NGRAPH_NOP_ENABLE: ${NGRAPH_NOP_ENABLE}")
139151
message(STATUS "NGRAPH_GPUH_ENABLE: ${NGRAPH_GPUH_ENABLE}")
140152
message(STATUS "NGRAPH_GENERIC_CPU_ENABLE: ${NGRAPH_GENERIC_CPU_ENABLE}")
141-
message(STATUS "NGRAPH_DISTRIBUTED_ENABLE: ${NGRAPH_DISTRIBUTED_ENABLE}")
153+
message(STATUS "NGRAPH_DISTRIBUTED_MLSL_ENABLE: ${NGRAPH_DISTRIBUTED_MLSL_ENABLE}")
154+
message(STATUS "NGRAPH_DISTRIBUTED_OMPI_ENABLE: ${NGRAPH_DISTRIBUTED_OMPI_ENABLE}")
142155
message(STATUS "NGRAPH_DEBUG_ENABLE: ${NGRAPH_DEBUG_ENABLE}")
143156
message(STATUS "NGRAPH_ONNX_IMPORT_ENABLE: ${NGRAPH_ONNX_IMPORT_ENABLE}")
144157
message(STATUS "NGRAPH_DEX_ONLY: ${NGRAPH_DEX_ONLY}")
@@ -260,6 +273,15 @@ if (NGRAPH_PLAIDML_ENABLE)
260273
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_PlaidML_ENABLE")
261274
endif()
262275

276+
if (NGRAPH_DISTRIBUTED_ENABLE)
277+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_DISTRIBUTED_ENABLE")
278+
if (NGRAPH_DISTRIBUTED_MLSL_ENABLE)
279+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_DISTRIBUTED_MLSL_ENABLE")
280+
elseif (NGRAPH_DISTRIBUTED_OMPI_ENABLE)
281+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_DISTRIBUTED_OMPI_ENABLE")
282+
endif()
283+
endif()
284+
263285
if (NOT DEFINED NGRAPH_TBB_ENABLE)
264286
set(NGRAPH_TBB_ENABLE ${NGRAPH_CPU_ENABLE})
265287
endif()
@@ -336,11 +358,11 @@ if (WIN32 OR APPLE)
336358
else()
337359
include(cmake/external_tbb.cmake)
338360
endif()
339-
if (NGRAPH_DISTRIBUTED_ENABLE)
361+
362+
if (NGRAPH_DISTRIBUTED_MLSL_ENABLE)
340363
include(cmake/external_mlsl.cmake)
341364
endif()
342365

343-
344366
if (NGRAPH_HALIDE)
345367
message(WARNING "Halide build system integration is currently using an older LLVM release \
346368
and is not expected to work across most build environments. Consider \

doc/examples/mnist_mlp/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ add_dependencies(mnist_mlp ngraph cpu_backend)
1919
target_link_libraries(mnist_mlp ngraph cpu_backend)
2020
if (NGRAPH_DISTRIBUTED_ENABLE)
2121
add_executable(dist_mnist_mlp mnist_loader.cpp dist_mnist_mlp.cpp)
22-
target_compile_definitions(dist_mnist_mlp PRIVATE NGRAPH_DISTRIBUTED)
22+
target_compile_definitions(dist_mnist_mlp PRIVATE NGRAPH_DISTRIBUTED_ENABLE)
2323
target_include_directories(dist_mnist_mlp SYSTEM PRIVATE libmlsl)
2424
target_link_libraries(dist_mnist_mlp ngraph cpu_backend libmlsl)
2525
endif()

src/ngraph/CMakeLists.txt

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ set (SRC
168168
placement.cpp
169169
cpio.cpp
170170
)
171+
171172
if(NGRAPH_DISTRIBUTED_ENABLE)
172173
list(APPEND SRC distributed.cpp)
173174
endif()
@@ -178,9 +179,16 @@ add_library(ngraph SHARED ${SRC})
178179

179180
if(NGRAPH_DISTRIBUTED_ENABLE)
180181
target_sources(ngraph PRIVATE distributed.cpp)
181-
target_compile_definitions(ngraph PRIVATE NGRAPH_DISTRIBUTED)
182-
target_include_directories(ngraph SYSTEM PRIVATE libmlsl)
183-
target_link_libraries(ngraph PRIVATE libmlsl)
182+
if(NGRAPH_DISTRIBUTED_MLSL_ENABLE)
183+
target_include_directories(ngraph SYSTEM PRIVATE libmlsl)
184+
target_link_libraries(ngraph PRIVATE libmlsl)
185+
elseif(NGRAPH_DISTRIBUTED_OMPI_ENABLE)
186+
find_package(MPI REQUIRED)
187+
target_include_directories(ngraph SYSTEM PRIVATE ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
188+
target_link_libraries(ngraph PRIVATE ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
189+
else()
190+
message(FATAL_ERROR "Distributed Library not supported/mentioned")
191+
endif()
184192
endif()
185193

186194
add_subdirectory(frontend)

src/ngraph/codegen/CMakeLists.txt

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,15 @@ if ((NGRAPH_GPU_ENABLE OR NGRAPH_CPU_ENABLE) AND NOT NGRAPH_DEX_ONLY)
4747
list(APPEND HEADER_SEARCH_DEFINES NGRAPH_HEADERS_PATH="${NGRAPH_INCLUDE_PATH}")
4848

4949
if(NGRAPH_DISTRIBUTED_ENABLE)
50-
get_target_property(MLSL_INCLUDE_DIR libmlsl INTERFACE_INCLUDE_DIRECTORIES)
51-
list(APPEND HEADER_SEARCH_DEFINES MLSL_HEADER_PATH="${MLSL_INCLUDE_DIR}")
52-
add_definitions(-DNGRAPH_DISTRIBUTED)
50+
if (NGRAPH_DISTRIBUTED_MLSL_ENABLE)
51+
get_target_property(MLSL_INCLUDE_DIR libmlsl INTERFACE_INCLUDE_DIRECTORIES)
52+
list(APPEND HEADER_SEARCH_DEFINES MLSL_HEADER_PATH="${MLSL_INCLUDE_DIR}")
53+
elseif(NGRAPH_DISTRIBUTED_OMPI_ENABLE)
54+
find_package(MPI REQUIRED)
55+
add_definitions(-DMPI_HEADER_PATH="${MPI_PATH}")
56+
else()
57+
message(FATAL_ERROR "Distributed Library not supported/mentioned")
58+
endif()
5359
endif()
5460

5561
if(NGRAPH_GPU_ENABLE)

src/ngraph/codegen/compiler.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,8 +472,14 @@ void codegen::CompilerCore::configure_search_path()
472472
add_header_search_path(CUDNN_HEADER_PATHS);
473473
#endif
474474

475-
#ifdef NGRAPH_DISTRIBUTED
475+
#ifdef NGRAPH_DISTRIBUTED_ENABLE
476+
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
476477
add_header_search_path(MLSL_HEADER_PATH);
478+
#elif NGRAPH_DISTRIBUTED_OMPI_ENABLE
479+
add_header_search_path(MPI_HEADER_PATH);
480+
#else
481+
throw ngraph_error("Distributed Library not supported/mentioned");
482+
#endif
477483
#endif
478484
}
479485

src/ngraph/distributed.cpp

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,37 +14,90 @@
1414
// limitations under the License.
1515
//*****************************************************************************
1616

17-
#ifdef NGRAPH_DISTRIBUTED
17+
#ifdef NGRAPH_DISTRIBUTED_ENABLE
1818

19+
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
1920
#include <mlsl.hpp>
21+
#elif NGRAPH_DISTRIBUTED_OMPI_ENABLE
22+
#include <mpi.h>
23+
#endif
2024

2125
#include "ngraph/distributed.hpp"
26+
#include "ngraph/log.hpp"
2227

2328
using namespace ngraph;
2429

2530
ngraph::Distributed::Distributed()
2631
{
32+
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
2733
if (!MLSL::Environment::GetEnv().IsInitialized())
2834
{
2935
MLSL::Environment::GetEnv().Init(nullptr, nullptr);
36+
this_init_comm = true;
37+
}
38+
#elif NGRAPH_DISTRIBUTED_OMPI_ENABLE
39+
int flag = 0;
40+
MPI_Initialized(&flag);
41+
if (!flag)
42+
{
43+
MPI_Init(NULL, NULL);
44+
this_init_comm = true;
3045
}
46+
#else
47+
throw ngraph_error("Distributed Library not supported/mentioned");
48+
#endif
3149
}
3250

3351
ngraph::Distributed::~Distributed()
3452
{
53+
if (this_init_comm == true)
54+
{
55+
finalize();
56+
}
57+
}
58+
59+
void ngraph::Distributed::finalize()
60+
{
61+
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
3562
if (MLSL::Environment::GetEnv().IsInitialized())
3663
{
3764
MLSL::Environment::GetEnv().Finalize();
3865
}
66+
#elif NGRAPH_DISTRIBUTED_OMPI_ENABLE
67+
int flag = 0;
68+
MPI_Initialized(&flag);
69+
if (flag)
70+
{
71+
MPI_Finalize();
72+
}
73+
#else
74+
throw ngraph_error("Distributed Library not supported/mentioned");
75+
#endif
3976
}
4077

41-
size_t ngraph::Distributed::get_size() const
78+
int ngraph::Distributed::get_size() const
4279
{
43-
return MLSL::Environment::GetEnv().GetProcessCount();
80+
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
81+
return static_cast<int>(MLSL::Environment::GetEnv().GetProcessCount());
82+
#elif NGRAPH_DISTRIBUTED_OMPI_ENABLE
83+
int size;
84+
MPI_Comm_size(MPI_COMM_WORLD, &size);
85+
return size;
86+
#else
87+
throw ngraph_error("Distributed Library not supported/mentioned");
88+
#endif
4489
}
4590

46-
size_t ngraph::Distributed::get_rank() const
91+
int ngraph::Distributed::get_rank() const
4792
{
48-
return MLSL::Environment::GetEnv().GetProcessIdx();
93+
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
94+
return static_cast<int>(MLSL::Environment::GetEnv().GetProcessIdx());
95+
#elif NGRAPH_DISTRIBUTED_OMPI_ENABLE
96+
int rank;
97+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
98+
return rank;
99+
#else
100+
throw ngraph_error("Distributed Library not supported/mentioned");
101+
#endif
49102
}
50103
#endif

src/ngraph/distributed.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,11 @@ namespace ngraph
2525
public:
2626
Distributed();
2727
~Distributed();
28-
size_t get_size() const;
29-
size_t get_rank() const;
28+
int get_size() const;
29+
int get_rank() const;
30+
31+
private:
32+
bool this_init_comm;
33+
void finalize();
3034
};
3135
}

src/ngraph/runtime/cpu/CMakeLists.txt

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -202,9 +202,18 @@ if (NGRAPH_CPU_ENABLE)
202202
target_compile_definitions(cpu_backend PRIVATE CPU_BACKEND_DLL_EXPORTS)
203203

204204
if(NGRAPH_DISTRIBUTED_ENABLE)
205-
target_compile_definitions(cpu_backend PRIVATE NGRAPH_DISTRIBUTED)
206-
target_include_directories(cpu_backend SYSTEM PRIVATE libmlsl)
207-
target_link_libraries(cpu_backend PRIVATE libmlsl)
205+
if(NGRAPH_DISTRIBUTED_MLSL_ENABLE)
206+
target_include_directories(cpu_backend SYSTEM PRIVATE libmlsl)
207+
target_link_libraries(cpu_backend PRIVATE libmlsl)
208+
elseif(NGRAPH_DISTRIBUTED_OMPI_ENABLE)
209+
find_package(MPI REQUIRED)
210+
target_include_directories(cpu_backend
211+
SYSTEM PRIVATE ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
212+
target_link_libraries(cpu_backend
213+
PRIVATE ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
214+
else()
215+
message(FATAL_ERROR "Distributed Library not supported/mentioned")
216+
endif()
208217
endif()
209218

210219
add_dependencies(cpu_backend ext_mkldnn ext_eigen)

src/ngraph/runtime/cpu/builder/allreduce.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,13 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515
//*****************************************************************************
16-
#ifdef NGRAPH_DISTRIBUTED
16+
#ifdef NGRAPH_DISTRIBUTED_ENABLE
1717

18+
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
1819
#include <mlsl.hpp>
20+
#elif NGRAPH_DISTRIBUTED_OMPI_ENABLE
21+
#include <mpi.h>
22+
#endif
1923

2024
#include "ngraph/op/allreduce.hpp"
2125
#include "ngraph/runtime/cpu/cpu_builder.hpp"
@@ -37,6 +41,8 @@ namespace ngraph
3741
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
3842
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
3943
auto count = static_cast<int>(out[0].get_size());
44+
45+
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
4046
auto data_type = MLSL::DT_FLOAT;
4147

4248
if (args[0].get_element_type() == element::f32)
@@ -54,7 +60,26 @@ namespace ngraph
5460
arg_tensor, out_tensor, count, data_type, MLSL::RT_SUM, MLSL::GT_DATA);
5561
ctx->mlsl_env->Wait(req);
5662
};
63+
#elif NGRAPH_DISTRIBUTED_OMPI_ENABLE
64+
auto data_type = MPI_FLOAT;
5765

66+
if (args[0].get_element_type() == element::f32)
67+
{
68+
data_type = MPI_FLOAT;
69+
}
70+
else if (args[0].get_element_type() == element::f64)
71+
{
72+
data_type = MPI_DOUBLE;
73+
}
74+
75+
auto functor = [&, count, data_type](CPURuntimeContext* ctx,
76+
CPUExecutionContext* ectx) {
77+
MPI_Allreduce(
78+
arg_tensor, out_tensor, count, data_type, MPI_SUM, MPI_COMM_WORLD);
79+
};
80+
#else
81+
throw ngraph_error("Distributed Library not supported/mentioned");
82+
#endif
5883
functors.emplace_back(functor);
5984
}
6085

src/ngraph/runtime/cpu/cpu_builder.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@
103103
#include "ngraph/type/element_type.hpp"
104104
#include "ngraph/util.hpp"
105105

106+
#ifdef NGRAPH_DISTRIBUTED_OMPI_ENABLE
107+
#include <mpi.h>
108+
#include "ngraph/op/allreduce.hpp"
109+
#endif
110+
106111
using namespace std;
107112
using namespace ngraph;
108113

0 commit comments

Comments
 (0)