Skip to content

Commit 70ab9c6

Browse files
committed
First bits working
1 parent 8f8fa56 commit 70ab9c6

File tree

9 files changed

+189
-11
lines changed

9 files changed

+189
-11
lines changed

CMakeLists.txt

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,22 @@ if (POLICY CMP0074)
66
endif ()
77

88
project(marian CXX C)
9+
10+
######### ARCH DETECTION #########
11+
# Architecture detection: the TargetArch module defines target_architecture(),
# which is used below to find out which architecture(s) are being targeted.
12+
include(TargetArch)
13+
14+
# Store the detected architecture list in CMAKE_TARGET_ARCHITECTURES.
target_architecture(CMAKE_TARGET_ARCHITECTURES)
15+
list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len)
16+
# Any list length other than exactly one is treated as a "universal" build.
if(NOT "${cmake_target_arch_len}" STREQUAL "1")
17+
set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE)
18+
set(CMAKE_TARGET_ARCHITECTURE_CODE "universal")
19+
else()
20+
set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE)
21+
# With a single entry, the list value itself is the architecture name.
set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}")
22+
endif()
23+
######### ARCH DETECTION #########
24+
925
set(CMAKE_CXX_STANDARD 17)
1026
set(CMAKE_CXX_STANDARD_REQUIRED ON)
1127
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
@@ -100,6 +116,15 @@ if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
100116
# Some warnings as errors. I don't feel comfortable about the strict aliasing.
101117
set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment")
102118

119+
set(USE_SIMD_UTILS ON)
120+
# @TODO this assumes ArmV8. We should also look at armv7
121+
add_compile_definitions(ARM FMA SSE) #added for ARM
122+
if(NOT MSVC)
123+
# -flax-vector-conversions is a GCC/Clang option that permits implicit
124+
# conversions between vector types; it is not an MSVC option, so skip it there.
125+
add_compile_options(-flax-vector-conversions)
126+
endif()
127+
103128
endif(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
104129

105130

@@ -534,7 +559,7 @@ endif()
534559
###############################################################################
535560
# Find BLAS library
536561
if(COMPILE_CPU)
537-
if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
562+
# Enable intgemm and define COMPILE_CPU=1 unless install targets are being
# generated or ARM is set.
# NOTE(review): ARM is evaluated here as a CMake variable. The visible ARM branch
# only passes ARM to add_compile_definitions(), which creates a preprocessor
# macro, not a CMake variable — confirm ARM is also set() somewhere, otherwise
# "NOT ARM" is always true and intgemm is still enabled on ARM.
if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM)
538563
set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
539564
add_definitions(-DCOMPILE_CPU=1)
540565
endif()

cmake/TargetArch.cmake

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake
2+
# Based on the Qt 5 processor detection code, so should be very accurate
3+
# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
4+
# Currently handles arm (v5, v6, v7, v8), x86 (32/64), ia64, and ppc (32/64)
5+
6+
# Regarding POWER/PowerPC, just as is noted in the Qt source,
7+
# "There are many more known variants/revisions that we do not handle/detect."
8+
9+
# C source used as a compile-time probe. Every preprocessor branch ends in an
# "#error cmake_ARCH <name>" directive, so the compiler's diagnostic output names
# the architecture being targeted: armv8/armv7/armv6/armv5/arm, i386, x86_64,
# ia64, ppc64 or ppc. The trailing "#error cmake_ARCH unknown" is unconditional;
# target_architecture() extracts the architecture name with a single regex match
# over the compiler output.
set(archdetect_c_code "
10+
#if defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__)
11+
#if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8) \\
12+
|| defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) \\
13+
|| defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R) \\
14+
|| defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\
15+
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8)
16+
#error cmake_ARCH armv8
17+
#elif defined(__ARM_ARCH_7__) \\
18+
|| defined(__ARM_ARCH_7A__) \\
19+
|| defined(__ARM_ARCH_7R__) \\
20+
|| defined(__ARM_ARCH_7M__) \\
21+
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
22+
#error cmake_ARCH armv7
23+
#elif defined(__ARM_ARCH_6__) \\
24+
|| defined(__ARM_ARCH_6J__) \\
25+
|| defined(__ARM_ARCH_6T2__) \\
26+
|| defined(__ARM_ARCH_6Z__) \\
27+
|| defined(__ARM_ARCH_6K__) \\
28+
|| defined(__ARM_ARCH_6ZK__) \\
29+
|| defined(__ARM_ARCH_6M__) \\
30+
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
31+
#error cmake_ARCH armv6
32+
#elif defined(__ARM_ARCH_5TEJ__) \\
33+
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
34+
#error cmake_ARCH armv5
35+
#else
36+
#error cmake_ARCH arm
37+
#endif
38+
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
39+
#error cmake_ARCH i386
40+
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
41+
#error cmake_ARCH x86_64
42+
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
43+
#error cmake_ARCH ia64
44+
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
45+
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
46+
|| defined(_M_MPPC) || defined(_M_PPC)
47+
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
48+
#error cmake_ARCH ppc64
49+
#else
50+
#error cmake_ARCH ppc
51+
#endif
52+
#endif
53+
54+
#error cmake_ARCH unknown
55+
")
56+
57+
58+
# Set ppc_support to TRUE before including this file or ppc and ppc64
59+
# will be treated as invalid architectures since they are no longer supported by Apple
60+
61+
# target_architecture(<output_var>)
#
# Detects the architecture(s) being targeted and stores the result in
# <output_var> in the caller's scope.
#
# - On Apple platforms with CMAKE_OSX_ARCHITECTURES set, the result is the list
#   of requested architectures in a normalized order (ppc, i386, x86_64, ppc64,
#   arm64). ppc and ppc64 are only accepted when ppc_support is true; any other
#   unrecognized name is a fatal error.
# - Otherwise a small C file is written to the build directory and compiled; the
#   architecture name is parsed from the "cmake_ARCH <name>" marker in the
#   compiler output. If nothing can be parsed, the result is "unknown".
function(target_architecture output_var)
62+
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
63+
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
64+
# First let's normalize the order of the values
65+
66+
# Note that it's not possible to compile PowerPC applications if you are using
67+
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
68+
# disable it by default
69+
# See this page for more information:
70+
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
71+
72+
# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
73+
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
74+
75+
foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
76+
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
77+
set(osx_arch_ppc TRUE)
78+
elseif("${osx_arch}" STREQUAL "i386")
79+
set(osx_arch_i386 TRUE)
80+
elseif("${osx_arch}" STREQUAL "x86_64")
81+
set(osx_arch_x86_64 TRUE)
82+
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
83+
set(osx_arch_ppc64 TRUE)
# Apple Silicon: accept arm64 instead of rejecting it as an invalid name.
elseif("${osx_arch}" STREQUAL "arm64")
set(osx_arch_arm64 TRUE)
84+
else()
85+
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
86+
endif()
87+
endforeach()
88+
89+
# Now add all the architectures in our normalized order
90+
if(osx_arch_ppc)
91+
list(APPEND ARCH ppc)
92+
endif()
93+
94+
if(osx_arch_i386)
95+
list(APPEND ARCH i386)
96+
endif()
97+
98+
if(osx_arch_x86_64)
99+
list(APPEND ARCH x86_64)
100+
endif()
101+
102+
if(osx_arch_ppc64)
103+
list(APPEND ARCH ppc64)
104+
endif()
# arm64 goes last so the relative order of the existing entries is unchanged.
if(osx_arch_arm64)
list(APPEND ARCH arm64)
endif()
105+
else()
106+
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
107+
108+
enable_language(C)
109+
110+
# Detect the architecture in a rather creative way...
111+
# This compiles a small C program which is a series of ifdefs that selects a
112+
# particular #error preprocessor directive whose message string contains the
113+
# target architecture. The program will always fail to compile (both because
114+
# the file is not a valid C program, and obviously because of the presence of
115+
# the #error preprocessor directives), but by exploiting the preprocessor in this
116+
# way, we can detect the correct target architecture even when cross-compiling,
117+
# since the program itself never needs to be run (only the compiler/preprocessor)
118+
try_run(
119+
run_result_unused
120+
compile_result_unused
121+
"${CMAKE_BINARY_DIR}"
122+
"${CMAKE_BINARY_DIR}/arch.c"
123+
COMPILE_OUTPUT_VARIABLE ARCH
124+
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
125+
)
126+
127+
# Parse the architecture name from the compiler output
128+
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
129+
130+
# Get rid of the value marker leaving just the architecture name
131+
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
132+
133+
# If we are compiling with an unknown architecture this variable should
134+
# already be set to "unknown" but in the case that it's empty (i.e. due
135+
# to a typo in the code), then set it to unknown
136+
if (NOT ARCH)
137+
set(ARCH unknown)
138+
endif()
139+
endif()
140+
141+
# Hand the result back to the caller.
set(${output_var} "${ARCH}" PARENT_SCOPE)
142+
endfunction()

src/3rd_party/faiss/VectorTransform.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919

2020
using namespace faiss;
2121

22+
#ifdef ARM
23+
#include "3rd_party/simd_utils/simd_utils.h"
24+
#endif
25+
2226

2327
extern "C" {
2428

src/common/types.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,11 @@
1717
#include <type_traits>
1818

1919
#ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
20-
#include <immintrin.h>
20+
#ifndef ARM
21+
#include <immintrin.h>
22+
#else
23+
#include "3rd_party/simd_utils/simd_utils.h"
24+
#endif
2125
#endif
2226

2327
#ifdef __CUDACC__ // nvcc is compiling this code

src/functional/operators.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,11 @@ struct Ops<double> {
217217
// __CUDACC__ is defined when compiling with NVCC regardless of device type
218218
// __CUDA_ARCH__ is defined when compiling device (GPU) code
219219
#ifndef __CUDACC__
220-
220+
#ifndef ARM
221221
#include "3rd_party/sse_mathfun.h"
222+
#else
223+
#include "3rd_party/simd_utils/simd_utils.h" // @TODO this might be dependent on NEON
224+
#endif
222225

223226
namespace marian {
224227
namespace functional {

src/tensors/cpu/expression_graph_packable.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ class ExpressionGraphPackable : public ExpressionGraph {
152152
#endif
153153
} else if (isIntgemm(gemmElementType) &&
154154
(pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
155-
#if COMPILE_CPU
155+
#if COMPILE_CPU && !defined(ARM)
156156
using cpu::integer::cols;
157157
using cpu::integer::rows;
158158
auto allocator = New<TensorAllocator>(getBackend());

src/tensors/cpu/fbgemm/packed_gemm.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,16 @@
22
#include "tensors/tensor_allocator.h"
33
#include "tensors/tensor_operators.h"
44

5-
#include <emmintrin.h>
6-
#include <immintrin.h>
7-
#include <tmmintrin.h>
8-
#include <xmmintrin.h>
95
#include <cassert>
106
#include <cstddef>
117
#include <unordered_map>
128
//#include <chrono>
139

1410
#if USE_FBGEMM
11+
#include <emmintrin.h>
12+
#include <immintrin.h>
13+
#include <tmmintrin.h>
14+
#include <xmmintrin.h>
1515
#ifdef _MSC_VER
1616
#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
1717
#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'

src/tensors/cpu/intgemm_interface.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ namespace marian {
99
namespace cpu {
1010
namespace integer {
1111

12-
#if COMPILE_CPU
12+
#if COMPILE_CPU && !defined(ARM)
1313
/*
1414
* Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized.
1515
* Expr input: The input tensor
@@ -45,7 +45,7 @@ static inline Expr prepareA(Expr a) {
4545
*/
4646
template<Type vtype>
4747
static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) {
48-
#if COMPILE_CPU
48+
#if COMPILE_CPU && !defined(ARM)
4949
ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type());
5050
ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type());
5151

0 commit comments

Comments
 (0)