Skip to content

patch for ci test #686

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ members = [
"monarch_tensor_worker",
"monarch_rdma",
"nccl-sys",
"rdmacore-sys",
"rdmaxcel-sys",
"torch-sys",
"rdmacore-sys",
"cuda-sys",
]
54 changes: 41 additions & 13 deletions cuda-sys/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ fn emit_cuda_link_directives(cuda_home: &str) {
}

fn python_env_dirs() -> (Option<String>, Option<String>) {
let output = std::process::Command::new(PathBuf::from("python"))
let output = std::process::Command::new(PathBuf::from("python3"))
.arg("-c")
.arg(PYTHON_PRINT_DIRS)
.output()
Expand All @@ -94,13 +94,13 @@ fn python_env_dirs() -> (Option<String>, Option<String>) {
}

fn main() {
// Start building the bindgen configuration
let mut builder = bindgen::Builder::default()
// The input header we would like to generate bindings for
.header("src/wrapper.h")
.clang_arg("-x")
.clang_arg("c++")
.clang_arg("-std=gnu++20")
.clang_arg(format!("-I{}/include", find_cuda_home().unwrap()))
.parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
// Allow the specified functions and types
.allowlist_function("cu.*")
Expand All @@ -113,6 +113,21 @@ fn main() {
is_global: false,
});

// Add CUDA include path if available
if let Some(cuda_home) = find_cuda_home() {
let cuda_include_path = format!("{}/include", cuda_home);
if Path::new(&cuda_include_path).exists() {
builder = builder.clang_arg(format!("-I{}", cuda_include_path));
} else {
eprintln!(
"Warning: CUDA include directory not found at {}",
cuda_include_path
);
}
} else {
eprintln!("Warning: CUDA home directory not found. Continuing without CUDA include path.");
}

// Include headers and libs from the active environment.
let (include_dir, lib_dir) = python_env_dirs();
if let Some(include_dir) = include_dir {
Expand All @@ -129,15 +144,28 @@ fn main() {
}

// Write the bindings to the $OUT_DIR/bindings.rs file
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
builder
.generate()
.expect("Unable to generate bindings")
.write_to_file(out_path.join("bindings.rs"))
.expect("Couldn't write bindings!");

println!("cargo:rustc-link-lib=cuda");
println!("cargo:rustc-link-lib=cudart");
println!("cargo::rustc-cfg=cargo");
println!("cargo::rustc-check-cfg=cfg(cargo)");
match env::var("OUT_DIR") {
Ok(out_dir) => {
let out_path = PathBuf::from(out_dir);
match builder.generate() {
Ok(bindings) => match bindings.write_to_file(out_path.join("bindings.rs")) {
Ok(_) => {
println!("cargo:rustc-link-lib=cuda");
println!("cargo:rustc-link-lib=cudart");
println!("cargo::rustc-cfg=cargo");
println!("cargo::rustc-check-cfg=cfg(cargo)");
}
Err(e) => eprintln!("Warning: Couldn't write bindings: {}", e),
},
Err(e) => eprintln!("Warning: Unable to generate bindings: {}", e),
}
}
Err(_) => {
// When running via buck2 run, OUT_DIR might not be set
// This is expected and not an error - we're just running the script directly
// The actual build will happen later with cargo
println!("cargo:rustc-link-lib=cuda");
println!("cargo:rustc-link-lib=cudart");
}
}
}
2 changes: 1 addition & 1 deletion monarch_rdma/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ async-trait = "0.1.86"
cuda-sys = { path = "../cuda-sys" }
hyperactor = { version = "0.0.0", path = "../hyperactor" }
rand = { version = "0.8", features = ["small_rng"] }
rdmacore-sys = { path = "../rdmacore-sys" }
rdmaxcel-sys = { path = "../rdmaxcel-sys" }
serde = { version = "1.0.185", features = ["derive", "rc"] }
tracing = { version = "0.1.41", features = ["attributes", "valuable"] }

Expand Down
96 changes: 48 additions & 48 deletions monarch_rdma/src/ibverbs_primitives.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,29 +60,29 @@ impl Gid {
u64::from_be_bytes(self.raw[8..].try_into().unwrap())
}
}
impl From<rdmacore_sys::ibv_gid> for Gid {
fn from(gid: rdmacore_sys::ibv_gid) -> Self {
impl From<rdmaxcel_sys::ibv_gid> for Gid {
fn from(gid: rdmaxcel_sys::ibv_gid) -> Self {
Self {
raw: unsafe { gid.raw },
}
}
}

impl From<Gid> for rdmacore_sys::ibv_gid {
impl From<Gid> for rdmaxcel_sys::ibv_gid {
fn from(mut gid: Gid) -> Self {
*gid.as_mut()
}
}

impl AsRef<rdmacore_sys::ibv_gid> for Gid {
fn as_ref(&self) -> &rdmacore_sys::ibv_gid {
unsafe { &*self.raw.as_ptr().cast::<rdmacore_sys::ibv_gid>() }
impl AsRef<rdmaxcel_sys::ibv_gid> for Gid {
fn as_ref(&self) -> &rdmaxcel_sys::ibv_gid {
unsafe { &*self.raw.as_ptr().cast::<rdmaxcel_sys::ibv_gid>() }
}
}

impl AsMut<rdmacore_sys::ibv_gid> for Gid {
fn as_mut(&mut self) -> &mut rdmacore_sys::ibv_gid {
unsafe { &mut *self.raw.as_mut_ptr().cast::<rdmacore_sys::ibv_gid>() }
impl AsMut<rdmaxcel_sys::ibv_gid> for Gid {
fn as_mut(&mut self) -> &mut rdmaxcel_sys::ibv_gid {
unsafe { &mut *self.raw.as_mut_ptr().cast::<rdmaxcel_sys::ibv_gid>() }
}
}

Expand Down Expand Up @@ -143,7 +143,7 @@ impl Default for IbverbsConfig {
max_recv_wr: 1,
max_send_sge: 1,
max_recv_sge: 1,
path_mtu: rdmacore_sys::IBV_MTU_1024,
path_mtu: rdmaxcel_sys::IBV_MTU_1024,
retry_cnt: 7,
rnr_retry: 7,
qp_timeout: 14, // 4.096 μs * 2^14 = ~67 ms
Expand Down Expand Up @@ -387,10 +387,10 @@ impl fmt::Display for RdmaPort {
/// # Returns
///
/// A string representation of the port state.
pub fn get_port_state_str(state: rdmacore_sys::ibv_port_state::Type) -> String {
pub fn get_port_state_str(state: rdmaxcel_sys::ibv_port_state::Type) -> String {
// SAFETY: We are calling a C function that returns a C string.
unsafe {
let c_str = rdmacore_sys::ibv_port_state_str(state);
let c_str = rdmaxcel_sys::ibv_port_state_str(state);
if c_str.is_null() {
return "Unknown".to_string();
}
Expand Down Expand Up @@ -485,7 +485,7 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
// SAFETY: We are calling several C functions from libibverbs.
unsafe {
let mut num_devices = 0;
let device_list = rdmacore_sys::ibv_get_device_list(&mut num_devices);
let device_list = rdmaxcel_sys::ibv_get_device_list(&mut num_devices);
if device_list.is_null() || num_devices == 0 {
return devices;
}
Expand All @@ -496,18 +496,18 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
continue;
}

let context = rdmacore_sys::ibv_open_device(device);
let context = rdmaxcel_sys::ibv_open_device(device);
if context.is_null() {
continue;
}

let device_name = CStr::from_ptr(rdmacore_sys::ibv_get_device_name(device))
let device_name = CStr::from_ptr(rdmaxcel_sys::ibv_get_device_name(device))
.to_string_lossy()
.into_owned();

let mut device_attr = rdmacore_sys::ibv_device_attr::default();
if rdmacore_sys::ibv_query_device(context, &mut device_attr) != 0 {
rdmacore_sys::ibv_close_device(context);
let mut device_attr = rdmaxcel_sys::ibv_device_attr::default();
if rdmaxcel_sys::ibv_query_device(context, &mut device_attr) != 0 {
rdmaxcel_sys::ibv_close_device(context);
continue;
}

Expand All @@ -532,11 +532,11 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
};

for port_num in 1..=device_attr.phys_port_cnt {
let mut port_attr = rdmacore_sys::ibv_port_attr::default();
if rdmacore_sys::ibv_query_port(
let mut port_attr = rdmaxcel_sys::ibv_port_attr::default();
if rdmaxcel_sys::ibv_query_port(
context,
port_num,
&mut port_attr as *mut rdmacore_sys::ibv_port_attr as *mut _,
&mut port_attr as *mut rdmaxcel_sys::ibv_port_attr as *mut _,
) != 0
{
continue;
Expand All @@ -546,8 +546,8 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {

let link_layer = get_link_layer_str(port_attr.link_layer);

let mut gid = rdmacore_sys::ibv_gid::default();
let gid_str = if rdmacore_sys::ibv_query_gid(context, port_num, 0, &mut gid) == 0 {
let mut gid = rdmaxcel_sys::ibv_gid::default();
let gid_str = if rdmaxcel_sys::ibv_query_gid(context, port_num, 0, &mut gid) == 0 {
format_gid(&gid.raw)
} else {
"N/A".to_string()
Expand All @@ -570,10 +570,10 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
}

devices.push(rdma_device);
rdmacore_sys::ibv_close_device(context);
rdmaxcel_sys::ibv_close_device(context);
}

rdmacore_sys::ibv_free_device_list(device_list);
rdmaxcel_sys::ibv_free_device_list(device_list);
}

devices
Expand All @@ -592,9 +592,9 @@ pub fn ibverbs_supported() -> bool {
// SAFETY: We are calling a C function from libibverbs.
unsafe {
let mut num_devices = 0;
let device_list = rdmacore_sys::ibv_get_device_list(&mut num_devices);
let device_list = rdmaxcel_sys::ibv_get_device_list(&mut num_devices);
if !device_list.is_null() {
rdmacore_sys::ibv_free_device_list(device_list);
rdmaxcel_sys::ibv_free_device_list(device_list);
}
num_devices > 0
}
Expand Down Expand Up @@ -670,20 +670,20 @@ pub enum RdmaOperation {
Read,
}

impl From<RdmaOperation> for rdmacore_sys::ibv_wr_opcode::Type {
impl From<RdmaOperation> for rdmaxcel_sys::ibv_wr_opcode::Type {
fn from(op: RdmaOperation) -> Self {
match op {
RdmaOperation::Write => rdmacore_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
RdmaOperation::Read => rdmacore_sys::ibv_wr_opcode::IBV_WR_RDMA_READ,
RdmaOperation::Write => rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
RdmaOperation::Read => rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_READ,
}
}
}

impl From<rdmacore_sys::ibv_wc_opcode::Type> for RdmaOperation {
fn from(op: rdmacore_sys::ibv_wc_opcode::Type) -> Self {
impl From<rdmaxcel_sys::ibv_wc_opcode::Type> for RdmaOperation {
fn from(op: rdmaxcel_sys::ibv_wc_opcode::Type) -> Self {
match op {
rdmacore_sys::ibv_wc_opcode::IBV_WC_RDMA_WRITE => RdmaOperation::Write,
rdmacore_sys::ibv_wc_opcode::IBV_WC_RDMA_READ => RdmaOperation::Read,
rdmaxcel_sys::ibv_wc_opcode::IBV_WC_RDMA_WRITE => RdmaOperation::Write,
rdmaxcel_sys::ibv_wc_opcode::IBV_WC_RDMA_READ => RdmaOperation::Read,
_ => panic!("Unsupported operation type"),
}
}
Expand Down Expand Up @@ -718,7 +718,7 @@ impl std::fmt::Debug for RdmaQpInfo {

/// Wrapper around ibv_wc (ibverbs work completion).
///
/// This exposes only the public fields of rdmacore_sys::ibv_wc, allowing us to more easily
/// This exposes only the public fields of rdmaxcel_sys::ibv_wc, allowing us to more easily
/// interact with it from Rust. Work completions are used to track the status of
/// RDMA operations and are generated when an operation completes.
#[derive(Debug, Named, Clone, serde::Serialize, serde::Deserialize)]
Expand All @@ -730,9 +730,9 @@ pub struct IbvWc {
/// `valid` - Whether the work completion is valid
valid: bool,
/// `error` - Error information if the operation failed
error: Option<(rdmacore_sys::ibv_wc_status::Type, u32)>,
error: Option<(rdmaxcel_sys::ibv_wc_status::Type, u32)>,
/// `opcode` - Type of operation that completed (read, write, etc.)
opcode: rdmacore_sys::ibv_wc_opcode::Type,
opcode: rdmaxcel_sys::ibv_wc_opcode::Type,
/// `bytes` - Immediate data (if any)
bytes: Option<u32>,
/// `qp_num` - Queue Pair Number
Expand All @@ -749,8 +749,8 @@ pub struct IbvWc {
dlid_path_bits: u8,
}

impl From<rdmacore_sys::ibv_wc> for IbvWc {
fn from(wc: rdmacore_sys::ibv_wc) -> Self {
impl From<rdmaxcel_sys::ibv_wc> for IbvWc {
fn from(wc: rdmaxcel_sys::ibv_wc) -> Self {
IbvWc {
wr_id: wc.wr_id(),
len: wc.len(),
Expand Down Expand Up @@ -862,21 +862,21 @@ mod tests {
#[test]
fn test_rdma_operation_conversion() {
assert_eq!(
rdmacore_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
rdmacore_sys::ibv_wr_opcode::Type::from(RdmaOperation::Write)
rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
rdmaxcel_sys::ibv_wr_opcode::Type::from(RdmaOperation::Write)
);
assert_eq!(
rdmacore_sys::ibv_wr_opcode::IBV_WR_RDMA_READ,
rdmacore_sys::ibv_wr_opcode::Type::from(RdmaOperation::Read)
rdmaxcel_sys::ibv_wr_opcode::IBV_WR_RDMA_READ,
rdmaxcel_sys::ibv_wr_opcode::Type::from(RdmaOperation::Read)
);

assert_eq!(
RdmaOperation::Write,
RdmaOperation::from(rdmacore_sys::ibv_wc_opcode::IBV_WC_RDMA_WRITE)
RdmaOperation::from(rdmaxcel_sys::ibv_wc_opcode::IBV_WC_RDMA_WRITE)
);
assert_eq!(
RdmaOperation::Read,
RdmaOperation::from(rdmacore_sys::ibv_wc_opcode::IBV_WC_RDMA_READ)
RdmaOperation::from(rdmaxcel_sys::ibv_wc_opcode::IBV_WC_RDMA_READ)
);
}

Expand All @@ -897,18 +897,18 @@ mod tests {

#[test]
fn test_ibv_wc() {
let mut wc = rdmacore_sys::ibv_wc::default();
let mut wc = rdmaxcel_sys::ibv_wc::default();

// SAFETY: modifies private fields through pointer manipulation
unsafe {
// Cast to pointer and modify the fields directly
let wc_ptr = &mut wc as *mut rdmacore_sys::ibv_wc as *mut u8;
let wc_ptr = &mut wc as *mut rdmaxcel_sys::ibv_wc as *mut u8;

// Set wr_id (at offset 0, u64)
*(wc_ptr as *mut u64) = 42;

// Set status to SUCCESS (at offset 8, a 32-bit field; written here through an `i32` pointer)
*(wc_ptr.add(8) as *mut i32) = rdmacore_sys::ibv_wc_status::IBV_WC_SUCCESS as i32;
*(wc_ptr.add(8) as *mut i32) = rdmaxcel_sys::ibv_wc_status::IBV_WC_SUCCESS as i32;
}
let ibv_wc = IbvWc::from(wc);
assert_eq!(ibv_wc.wr_id(), 42);
Expand Down
Loading