From d442d512e8d403c5ebcddd0a7de4c6677bac6126 Mon Sep 17 00:00:00 2001 From: gbaraldi Date: Thu, 3 Jul 2025 12:11:43 -0300 Subject: [PATCH 1/2] Actually setup jit targets when compiling packageimages instead of targeting only one (#54471) --- src/codegen.cpp | 5 ++- src/llvm-multiversioning.cpp | 1 + src/processor_arm.cpp | 51 ++++++++++++++++++++++-- src/processor_fallback.cpp | 21 ++++++++-- src/processor_x86.cpp | 77 ++++++++++++++++++++++++++++++++++-- 5 files changed, 144 insertions(+), 11 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 53a278d42d102..206b35e9bb30d 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -6552,8 +6552,11 @@ static Function* gen_cfun_wrapper( ctx.builder.ClearInsertionPoint(); if (aliasname) { - GlobalAlias::create(cw->getValueType(), cw->getType()->getAddressSpace(), + auto alias = GlobalAlias::create(cw->getValueType(), cw->getType()->getAddressSpace(), GlobalValue::ExternalLinkage, aliasname, cw, M); + if(ctx.emission_context.TargetTriple.isOSBinFormatCOFF()) { + alias->setDLLStorageClass(GlobalValue::DLLStorageClassTypes::DLLExportStorageClass); + } } if (nest) { diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index accdef0aaaa83..4324f324aba61 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -674,6 +674,7 @@ void CloneCtx::rewrite_alias(GlobalAlias *alias, Function *F) trampoline->removeFnAttr("julia.mv.reloc"); trampoline->removeFnAttr("julia.mv.clones"); trampoline->addFnAttr("julia.mv.alias"); + trampoline->setDLLStorageClass(alias->getDLLStorageClass()); alias->eraseFromParent(); uint32_t id; diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp index 0018d2ec925d9..2c9315ec54eef 100644 --- a/src/processor_arm.cpp +++ b/src/processor_arm.cpp @@ -1871,12 +1871,55 @@ const std::pair &jl_get_llvm_disasm_target(void) return res; } +#ifndef __clang_gcanalyzer__ std::vector jl_get_llvm_clone_targets(void) { - if (jit_targets.empty()) - jl_error("JIT targets not initialized"); +auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, true); + llvm::SmallVector, 0> image_targets; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, image_targets.empty()); + image_targets.push_back(std::move(data)); + } + auto ntargets = image_targets.size(); + if (image_targets.empty()) + jl_error("No targets specified"); std::vector res; - for (auto &target: jit_targets) { + // Now decide the clone condition. + for (size_t i = 1; i < ntargets; i++) { + auto &t = image_targets[i]; + if (t.en.flags & JL_TARGET_CLONE_ALL) + continue; + auto &features0 = image_targets[t.base].en.features; + // Always clone when code checks CPU features + t.en.flags |= JL_TARGET_CLONE_CPU; + static constexpr uint32_t clone_fp16[] = {Feature::fp16fml,Feature::fullfp16}; + for (auto fe: clone_fp16) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_FLOAT16; + break; + } + } + // The most useful one in general... + t.en.flags |= JL_TARGET_CLONE_LOOP; +#ifdef _CPU_ARM_ + static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon}; + for (auto fe: clone_math) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_MATH; + break; + } + } + static constexpr uint32_t clone_simd[] = {Feature::neon}; + for (auto fe: clone_simd) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_SIMD; + break; + } + } +#endif + } + for (auto &target: image_targets) { auto features_en = target.en.features; auto features_dis = target.dis.features; for (auto &fename: feature_names) { @@ -1896,6 +1939,8 @@ std::vector jl_get_llvm_clone_targets(void) } return res; } +#endif + extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) { diff --git a/src/processor_fallback.cpp b/src/processor_fallback.cpp index 833cd02b5fdfc..d5c80219bd96a 100644 --- a/src/processor_fallback.cpp +++ b/src/processor_fallback.cpp @@ -145,12 +145,26 @@ const std::pair &jl_get_llvm_disasm_target(void) return res; } +#ifndef __clang_gcanalyzer__ extern "C" std::vector jl_get_llvm_clone_targets(void) { - if (jit_targets.empty()) - jl_error("JIT targets not initialized"); + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, true); + llvm::SmallVector, 0> image_targets; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, image_targets.empty()); + image_targets.push_back(std::move(data)); + } + auto ntargets = image_targets.size(); + // Now decide the clone condition. + for (size_t i = 1; i < ntargets; i++) { + auto &t = image_targets[i]; + t.en.flags |= JL_TARGET_CLONE_ALL; + } + if (image_targets.empty()) + jl_error("No image targets found"); std::vector res; - for (auto &target: jit_targets) { + for (auto &target: image_targets) { jl_target_spec_t ele; std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); ele.data = serialize_target_data(target.name, target.en.features, @@ -161,6 +175,7 @@ extern "C" std::vector jl_get_llvm_clone_targets(void) } return res; } +#endif JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) { diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index d96e2061ee674..0f69f310476f8 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -1095,13 +1095,81 @@ extern "C" JL_DLLEXPORT const std::pair &jl_get_llvm_di {feature_masks, 0}, {{}, 0}, 0}); return res; } - +#ifndef __clang_gcanalyzer__ extern "C" JL_DLLEXPORT std::vector jl_get_llvm_clone_targets(void) { - if (jit_targets.empty()) - jl_error("JIT targets not initialized"); + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, true); + llvm::SmallVector, 0> image_targets; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, image_targets.empty()); + image_targets.push_back(std::move(data)); + } + + auto ntargets = image_targets.size(); + // Now decide the clone condition. + for (size_t i = 1; i < ntargets; i++) { + auto &t = image_targets[i]; + if (t.en.flags & JL_TARGET_CLONE_ALL) + continue; + // Always clone when code checks CPU features + t.en.flags |= JL_TARGET_CLONE_CPU; + // The most useful one in general... + t.en.flags |= JL_TARGET_CLONE_LOOP; + auto &features0 = image_targets[t.base].en.features; + // Special case for KNL/KNM since they're so different + if (!(t.dis.flags & JL_TARGET_CLONE_ALL)) { + if ((t.name == "knl" || t.name == "knm") && + image_targets[t.base].name != "knl" && image_targets[t.base].name != "knm") { + t.en.flags |= JL_TARGET_CLONE_ALL; + break; + } + } + static constexpr uint32_t clone_math[] = {Feature::fma, Feature::fma4}; + static constexpr uint32_t clone_simd[] = {Feature::sse3, Feature::ssse3, + Feature::sse41, Feature::sse42, + Feature::avx, Feature::avx2, + Feature::vaes, Feature::vpclmulqdq, + Feature::sse4a, Feature::avx512f, + Feature::avx512dq, Feature::avx512ifma, + Feature::avx512pf, Feature::avx512er, + Feature::avx512cd, Feature::avx512bw, + Feature::avx512vl, Feature::avx512vbmi, + Feature::avx512vpopcntdq, Feature::avxvnni, + Feature::avx512vbmi2, Feature::avx512vnni, + Feature::avx512bitalg, Feature::avx512bf16, + Feature::avx512vp2intersect, Feature::avx512fp16}; + for (auto fe: clone_math) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_MATH; + break; + } + } + for (auto fe: clone_simd) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_SIMD; + break; + } + } + static constexpr uint32_t clone_fp16[] = {Feature::avx512fp16}; + for (auto fe: clone_fp16) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_FLOAT16; + break; + } + } + static constexpr uint32_t clone_bf16[] = {Feature::avx512bf16}; + for (auto fe: clone_bf16) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_BFLOAT16; + break; + } + } + } + if (image_targets.empty()) + jl_error("No targets specified"); std::vector res; - for (auto &target: jit_targets) { + for (auto &target: image_targets) { auto features_en = target.en.features; auto features_dis = target.dis.features; for (auto &fename: feature_names) { @@ -1121,6 +1189,7 @@ extern "C" JL_DLLEXPORT std::vector jl_get_llvm_clone_targets( } return res; } +#endif extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) { From 6bdda6138eb5f491444f9cc5bfeb6049526f7493 Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Thu, 3 Jul 2025 11:38:18 -0400 Subject: [PATCH 2/2] Drop BFLOAT16-related feature testing/support --- src/processor_x86.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index 0f69f310476f8..56ced0e353e6a 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -1158,13 +1158,6 @@ extern "C" JL_DLLEXPORT std::vector jl_get_llvm_clone_targets( break; } } - static constexpr uint32_t clone_bf16[] = {Feature::avx512bf16}; - for (auto fe: clone_bf16) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_BFLOAT16; - break; - } - } } if (image_targets.empty()) jl_error("No targets specified");