Skip to content

Commit 448bb3c

Browse files
jan-wassenberg authored and copybara-github committed
GCC bug workaround for f16x1, also simplify Broadcast Tx1
PiperOrigin-RevId: 561094779
1 parent 9f216c8 commit 448bb3c

File tree

2 files changed

+36
-35
lines changed

2 files changed

+36
-35
lines changed

hwy/ops/arm_neon-inl.h

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -4768,14 +4768,20 @@ HWY_API VFromD<DH> UpperHalf(DH dh, VFromD<Twice<DH>> v) {
47684768

47694769
// ------------------------------ Broadcast/splat any lane
47704770

4771+
template <int kLane, typename T>
4772+
HWY_API Vec128<T, 1> Broadcast(Vec128<T, 1> v) {
4773+
return v;
4774+
}
4775+
47714776
#if HWY_ARCH_ARM_A64
47724777
// Unsigned
47734778
template <int kLane>
47744779
HWY_API Vec128<uint8_t> Broadcast(Vec128<uint8_t> v) {
47754780
static_assert(0 <= kLane && kLane < 16, "Invalid lane");
47764781
return Vec128<uint8_t>(vdupq_laneq_u8(v.raw, kLane));
47774782
}
4778-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
4783+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8),
4784+
HWY_IF_LANES_GT(N, 1)>
47794785
HWY_API Vec128<uint8_t, N> Broadcast(Vec128<uint8_t, N> v) {
47804786
static_assert(0 <= kLane && kLane < N, "Invalid lane");
47814787
return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
@@ -4785,7 +4791,8 @@ HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
47854791
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
47864792
return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
47874793
}
4788-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
4794+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8),
4795+
HWY_IF_LANES_GT(N, 1)>
47894796
HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
47904797
static_assert(0 <= kLane && kLane < N, "Invalid lane");
47914798
return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
@@ -4795,7 +4802,8 @@ HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
47954802
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
47964803
return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
47974804
}
4798-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
4805+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8),
4806+
HWY_IF_LANES_GT(N, 1)>
47994807
HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
48004808
static_assert(0 <= kLane && kLane < N, "Invalid lane");
48014809
return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
@@ -4805,15 +4813,15 @@ HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
48054813
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
48064814
return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
48074815
}
4808-
// Vec64<uint64_t> is defined below.
48094816

48104817
// Signed
48114818
template <int kLane>
48124819
HWY_API Vec128<int8_t> Broadcast(Vec128<int8_t> v) {
48134820
static_assert(0 <= kLane && kLane < 16, "Invalid lane");
48144821
return Vec128<int8_t>(vdupq_laneq_s8(v.raw, kLane));
48154822
}
4816-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
4823+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8),
4824+
HWY_IF_LANES_GT(N, 1)>
48174825
HWY_API Vec128<int8_t, N> Broadcast(Vec128<int8_t, N> v) {
48184826
static_assert(0 <= kLane && kLane < N, "Invalid lane");
48194827
return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
@@ -4823,7 +4831,8 @@ HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
48234831
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
48244832
return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
48254833
}
4826-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
4834+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8),
4835+
HWY_IF_LANES_GT(N, 1)>
48274836
HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
48284837
static_assert(0 <= kLane && kLane < N, "Invalid lane");
48294838
return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
@@ -4833,7 +4842,8 @@ HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
48334842
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
48344843
return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
48354844
}
4836-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
4845+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8),
4846+
HWY_IF_LANES_GT(N, 1)>
48374847
HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
48384848
static_assert(0 <= kLane && kLane < N, "Invalid lane");
48394849
return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
@@ -4843,7 +4853,6 @@ HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
48434853
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
48444854
return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
48454855
}
4846-
// Vec64<int64_t> is defined below.
48474856

48484857
// Float
48494858
#if HWY_HAVE_FLOAT16
@@ -4852,7 +4861,8 @@ HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
48524861
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
48534862
return Vec128<float16_t>(vdupq_laneq_f16(v.raw, kLane));
48544863
}
4855-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
4864+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
4865+
HWY_IF_LANES_GT(N, 1)>
48564866
HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
48574867
static_assert(0 <= kLane && kLane < N, "Invalid lane");
48584868
return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
@@ -4864,7 +4874,8 @@ HWY_API Vec128<float> Broadcast(Vec128<float> v) {
48644874
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
48654875
return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
48664876
}
4867-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
4877+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8),
4878+
HWY_IF_LANES_GT(N, 1)>
48684879
HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
48694880
static_assert(0 <= kLane && kLane < N, "Invalid lane");
48704881
return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
@@ -4874,11 +4885,6 @@ HWY_API Vec128<double> Broadcast(Vec128<double> v) {
48744885
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
48754886
return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
48764887
}
4877-
template <int kLane>
4878-
HWY_API Vec64<double> Broadcast(Vec64<double> v) {
4879-
static_assert(0 <= kLane && kLane < 1, "Invalid lane");
4880-
return v;
4881-
}
48824888

48834889
#else // !HWY_ARCH_ARM_A64
48844890
// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
@@ -4889,7 +4895,8 @@ HWY_API Vec128<uint8_t> Broadcast(Vec128<uint8_t> v) {
48894895
static_assert(0 <= kLane && kLane < 16, "Invalid lane");
48904896
return Vec128<uint8_t>(vdupq_n_u8(vgetq_lane_u8(v.raw, kLane)));
48914897
}
4892-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
4898+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8),
4899+
HWY_IF_LANES_GT(N, 1)>
48934900
HWY_API Vec128<uint8_t, N> Broadcast(Vec128<uint8_t, N> v) {
48944901
static_assert(0 <= kLane && kLane < N, "Invalid lane");
48954902
return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
@@ -4899,7 +4906,8 @@ HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
48994906
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
49004907
return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
49014908
}
4902-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
4909+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8),
4910+
HWY_IF_LANES_GT(N, 1)>
49034911
HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
49044912
static_assert(0 <= kLane && kLane < N, "Invalid lane");
49054913
return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
@@ -4909,7 +4917,8 @@ HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
49094917
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
49104918
return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
49114919
}
4912-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
4920+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8),
4921+
HWY_IF_LANES_GT(N, 1)>
49134922
HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
49144923
static_assert(0 <= kLane && kLane < N, "Invalid lane");
49154924
return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
@@ -4919,15 +4928,15 @@ HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
49194928
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
49204929
return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
49214930
}
4922-
// Vec64<uint64_t> is defined below.
49234931

49244932
// Signed
49254933
template <int kLane>
49264934
HWY_API Vec128<int8_t> Broadcast(Vec128<int8_t> v) {
49274935
static_assert(0 <= kLane && kLane < 16, "Invalid lane");
49284936
return Vec128<int8_t>(vdupq_n_s8(vgetq_lane_s8(v.raw, kLane)));
49294937
}
4930-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
4938+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8),
4939+
HWY_IF_LANES_GT(N, 1)>
49314940
HWY_API Vec128<int8_t, N> Broadcast(Vec128<int8_t, N> v) {
49324941
static_assert(0 <= kLane && kLane < N, "Invalid lane");
49334942
return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
@@ -4937,7 +4946,8 @@ HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
49374946
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
49384947
return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
49394948
}
4940-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
4949+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8),
4950+
HWY_IF_LANES_GT(N, 1)>
49414951
HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
49424952
static_assert(0 <= kLane && kLane < N, "Invalid lane");
49434953
return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
@@ -4947,7 +4957,8 @@ HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
49474957
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
49484958
return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
49494959
}
4950-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
4960+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8),
4961+
HWY_IF_LANES_GT(N, 1)>
49514962
HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
49524963
static_assert(0 <= kLane && kLane < N, "Invalid lane");
49534964
return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
@@ -4957,7 +4968,6 @@ HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
49574968
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
49584969
return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
49594970
}
4960-
// Vec64<int64_t> is defined below.
49614971

49624972
// Float
49634973
#if HWY_HAVE_FLOAT16
@@ -4972,25 +4982,15 @@ HWY_API Vec128<float> Broadcast(Vec128<float> v) {
49724982
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
49734983
return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
49744984
}
4975-
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
4985+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8),
4986+
HWY_IF_LANES_GT(N, 1)>
49764987
HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
49774988
static_assert(0 <= kLane && kLane < N, "Invalid lane");
49784989
return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
49794990
}
49804991

49814992
#endif // HWY_ARCH_ARM_A64
49824993

4983-
template <int kLane>
4984-
HWY_API Vec64<uint64_t> Broadcast(Vec64<uint64_t> v) {
4985-
static_assert(0 <= kLane && kLane < 1, "Invalid lane");
4986-
return v;
4987-
}
4988-
template <int kLane>
4989-
HWY_API Vec64<int64_t> Broadcast(Vec64<int64_t> v) {
4990-
static_assert(0 <= kLane && kLane < 1, "Invalid lane");
4991-
return v;
4992-
}
4993-
49944994
// ------------------------------ TableLookupLanes
49954995

49964996
// Returned by SetTableIndices for use by TableLookupLanes.

hwy/tests/blockwise_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ struct TestBroadcastR {
4343
for (size_t block = 0; block < N; block += blockN) {
4444
in_lanes[block + kLane] = static_cast<T>(block + 1);
4545
}
46+
PreventElision(in_lanes[0]); // workaround for f16x1 failure
4647
const auto in = Load(d, in_lanes.get());
4748
for (size_t block = 0; block < N; block += blockN) {
4849
for (size_t i = 0; i < blockN; ++i) {

0 commit comments

Comments
 (0)