First bits working

XapaJIaMnu · XapaJIaMnu · commit 70ab9c6db53f · 2023-09-15T16:30:18.000Z
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -6,6 +6,22 @@ if (POLICY CMP0074)
 endif ()
 
 project(marian CXX C)
+
+######### ARCH DETECTION #########
+# Ask target_architecture() (cmake/TargetArch.cmake) what we build for; exactly one entry means a single-arch build.
+include(TargetArch)
+
+target_architecture(CMAKE_TARGET_ARCHITECTURES)
+list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len)
+if("${cmake_target_arch_len}" STREQUAL "1")
+    set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE)
+    set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}")
+else()
+    set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE)
+    set(CMAKE_TARGET_ARCHITECTURE_CODE "universal")
+endif()
+######### ARCH DETECTION #########
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
@@ -100,6 +116,15 @@ if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
   # Some warnings as errors. I don't feel comfortable about the strict aliasing.
   set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment")
 
+  # @TODO this assumes ArmV8. We should also look at armv7
+  set(USE_SIMD_UTILS ON)
+  add_compile_definitions(ARM FMA SSE) # preprocessor macros only. NOTE(review): confirm the CMake variable ARM tested by the intgemm check is set elsewhere
+  if(NOT MSVC)
+    add_compile_options(-flax-vector-conversions)
+  else()
+    add_compile_options(/flax-vector-conversions) # NOTE(review): GCC/Clang-style option with a '/' prefix; confirm the MSVC toolchain accepts it
+  endif()
+
 endif(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
 
 
@@ -534,7 +559,7 @@ endif()
 ###############################################################################
 # Find BLAS library
 if(COMPILE_CPU)
-  if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
+  if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM)
     set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
     add_definitions(-DCOMPILE_CPU=1)
   endif()
diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake
@@ -0,0 +1,142 @@
+# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake
+# Based on the Qt 5 processor detection code, so should be very accurate
+# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
+# Currently handles arm (v5, v6, v7, v8, or generic "arm"), x86 (32/64), ia64, and ppc (32/64)
+
+# Regarding POWER/PowerPC, just as is noted in the Qt source,
+# "There are many more known variants/revisions that we do not handle/detect."
+# C source that target_architecture() writes to arch.c and compiles; the matching branch raises "#error cmake_ARCH <name>", which is then parsed from the compiler output.
+set(archdetect_c_code "
+#if defined(__arm__) || defined(__TARGET_ARCH_ARM)  || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__)
+    #if defined(__ARM_ARCH_8__)     || defined(__ARM_ARCH_8)          \\
+        || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A)         \\
+        || defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R)         \\
+        || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M)         \\
+        || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8)
+        #error cmake_ARCH armv8
+    #elif defined(__ARM_ARCH_7__)                                     \\
+        || defined(__ARM_ARCH_7A__)                                   \\
+        || defined(__ARM_ARCH_7R__)                                   \\
+        || defined(__ARM_ARCH_7M__)                                   \\
+        || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
+        #error cmake_ARCH armv7
+    #elif defined(__ARM_ARCH_6__)                                      \\
+        || defined(__ARM_ARCH_6J__)                                    \\
+        || defined(__ARM_ARCH_6T2__)                                   \\
+        || defined(__ARM_ARCH_6Z__)                                    \\
+        || defined(__ARM_ARCH_6K__)                                    \\
+        || defined(__ARM_ARCH_6ZK__)                                   \\
+        || defined(__ARM_ARCH_6M__)                                    \\
+        || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
+        #error cmake_ARCH armv6
+    #elif defined(__ARM_ARCH_5TEJ__) \\
+        || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
+        #error cmake_ARCH armv5
+    #else
+        #error cmake_ARCH arm
+    #endif
+#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
+    #error cmake_ARCH i386
+#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
+    #error cmake_ARCH x86_64
+#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+    #error cmake_ARCH ia64
+#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
+      || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC)  \\
+      || defined(_M_MPPC) || defined(_M_PPC)
+    #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
+        #error cmake_ARCH ppc64
+    #else
+        #error cmake_ARCH ppc
+    #endif
+#endif
+
+#error cmake_ARCH unknown
+")
+
+
+# target_architecture(<output_var>): stores the detected target architecture name(s) in <output_var> (caller's scope).
+# * If APPLE and CMAKE_OSX_ARCHITECTURES are set, the result is the requested
+#   architectures in a normalized order: ppc, i386, x86_64, ppc64, arm64.
+#   ppc and ppc64 are only accepted when ppc_support is TRUE at call time (they are
+#   no longer supported by Apple); any other unrecognized name is a fatal error.
+# * Otherwise archdetect_c_code is compiled and the name is parsed out of the
+#   compiler's "cmake_ARCH <name>" diagnostic; "unknown" is used if nothing matches.
+function(target_architecture output_var)
+    # The caller's variables are visible in here, so reset everything accumulated below.
+    set(ARCH "")
+    set(known_osx_archs ppc i386 x86_64 ppc64 arm64)
+    foreach(known_arch ${known_osx_archs})
+        set(osx_arch_${known_arch} FALSE)
+    endforeach()
+
+    if(APPLE AND CMAKE_OSX_ARCHITECTURES)
+        # On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
+        # First let's normalize the order of the values
+        # Note that it's not possible to compile PowerPC applications if you are using
+        # the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
+        # disable it by default
+        # See http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
+        # Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
+        # On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
+
+        foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
+            if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
+                set(osx_arch_ppc TRUE)
+            elseif("${osx_arch}" STREQUAL "i386")
+                set(osx_arch_i386 TRUE)
+            elseif("${osx_arch}" STREQUAL "x86_64")
+                set(osx_arch_x86_64 TRUE)
+            elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
+                set(osx_arch_ppc64 TRUE)
+            elseif("${osx_arch}" STREQUAL "arm64")
+                set(osx_arch_arm64 TRUE)
+            else()
+                message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
+            endif()
+        endforeach()
+
+        # Now add all the architectures in our normalized order
+        foreach(known_arch ${known_osx_archs})
+            if(osx_arch_${known_arch})
+                list(APPEND ARCH ${known_arch})
+            endif()
+        endforeach()
+    else()
+        file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
+
+        enable_language(C)
+
+        # Detect the architecture in a rather creative way...
+        # This compiles a small C program which is a series of ifdefs that selects a
+        # particular #error preprocessor directive whose message string contains the
+        # target architecture. The program will always fail to compile (both because
+        # the file is not a valid C program, and obviously because of the presence of
+        # the #error preprocessor directives), but by exploiting the preprocessor in this
+        # way, we can detect the correct target architecture even when cross-compiling,
+        # since the program itself never needs to be run (only the compiler/preprocessor)
+        try_run(
+            run_result_unused
+            compile_result_unused
+            "${CMAKE_BINARY_DIR}"
+            "${CMAKE_BINARY_DIR}/arch.c"
+            COMPILE_OUTPUT_VARIABLE ARCH
+            CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
+        )
+
+        # Parse the architecture name from the compiler output
+        string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
+
+        # Get rid of the value marker leaving just the architecture name
+        string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
+
+        # If we are compiling with an unknown architecture this variable should
+        # already be set to "unknown" but in the case that it's empty (i.e. due
+        # to a typo in the code), then set it to unknown
+        if (NOT ARCH)
+            set(ARCH unknown)
+        endif()
+    endif()
+
+    set(${output_var} "${ARCH}" PARENT_SCOPE)
+endfunction()
diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp
@@ -19,6 +19,10 @@
 
 using namespace faiss;
 
+#ifdef ARM
+#include "3rd_party/simd_utils/simd_utils.h"
+#endif
+
 
 extern "C" {
 
diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils
@@ -1 +1 @@
-Subproject commit c092ef9dd406cd9b9d54da1ff30cc86c39b4c0a5
+Subproject commit 6960362584481c977cdae9f6a8f7061a37c766cb
diff --git a/src/common/types.h b/src/common/types.h
@@ -17,7 +17,11 @@
 #include <type_traits>
 
 #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
-#include <immintrin.h>
+  #ifndef ARM
+    #include <immintrin.h>
+  #else
+    #include "3rd_party/simd_utils/simd_utils.h"
+  #endif
 #endif
 
 #ifdef __CUDACC__ // nvcc is compiling this code
diff --git a/src/functional/operators.h b/src/functional/operators.h
@@ -217,8 +217,11 @@ struct Ops<double> {
 // __CUDACC__ is defined when compiling with NVCC regardless of device type
 // __CUDA_ARCH__ is defined when compiling device (GPU) code
 #ifndef __CUDACC__
-
+#ifndef ARM
 #include "3rd_party/sse_mathfun.h"
+#else
+#include "3rd_party/simd_utils/simd_utils.h" // @TODO this might be dependent on NEON
+#endif
 
 namespace marian {
 namespace functional {
diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h
@@ -152,7 +152,7 @@ class ExpressionGraphPackable : public ExpressionGraph {
 #endif
       } else if (isIntgemm(gemmElementType) &&
       (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
         using cpu::integer::cols;
         using cpu::integer::rows;
         auto allocator = New<TensorAllocator>(getBackend());
diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp
@@ -2,16 +2,16 @@
 #include "tensors/tensor_allocator.h"
 #include "tensors/tensor_operators.h"
 
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
 #include <cassert>
 #include <cstddef>
 #include <unordered_map>
 //#include <chrono>
 
 #if USE_FBGEMM
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
 #ifdef _MSC_VER
 #pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
 #pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h
@@ -9,7 +9,7 @@ namespace marian {
 namespace cpu {
 namespace integer {
 
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
 /*
  * Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized.
  * Expr input: The input tensor
@@ -45,7 +45,7 @@ static inline Expr prepareA(Expr a) {
  */
 template<Type vtype>
 static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
   ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type());
   ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type());