Skip to content
110 changes: 110 additions & 0 deletions signal/micro/kernels/hexagon/hexagon_square_root.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

.section .note.GNU-stack,"",@progbits

// SignalHexagonSqrt32
// input: R0 unsigned 32-bit
// output: R0 unsigned 32-bit
//
// The assembly routine below implements the following:
//
// uint16_t Sqrt32(uint32_t num) {
// uint32_t res = 0;
// uint32_t bit = ((int32_t)1) << 30U;
// while (bit > num)
// bit >>= 2;
// while (bit != 0) {
// if (num >= res + bit) {
// num -= res + bit;
// res = (res >> 1U) + bit;
// } else {
// res >>= 1U;
// }
// bit >>= 2U;
// }
// // Do rounding
// if (num > res && res != 0xFFFF)
// ++res;
// return res;
// }

.text
.p2align 2
.p2align 4,,15
.globl SignalHexagonSqrt32
.type SignalHexagonSqrt32, @function

// Register mnemonics
// These preprocessor names map the variables of the reference C routine in
// the header comment onto Hexagon general registers. The argument arrives in
// R0 and the result is handed back in R0 as well.
// NOTE(review): R1-R6 are overwritten without being saved; presumably they
// are caller-saved scratch registers under the Hexagon ABI - confirm.
#define num R0 // input - as in loop above
#define res R1 // as in loop above
#define bit R2 // as in loop above
#define temp R3 // the quantity bit + res
#define zcount R4 // leading zeroes
#define res_shift R5 // the quantity res >> 1
#define bit_shift R6 // the quantity bit >> 2

SignalHexagonSqrt32:
// Set bit to the largest even-power of two
// that is less than or equal to the input
// (replaces the "while (bit > num) bit >>= 2" loop of the reference code
// with a leading-zero count and a single shift)
{
res = #0 // return value
bit = ##1073741824 // 2^30
zcount = cl0(num) // count leading zeroes
}
// Clearing bit 0 rounds the leading-zero count down to an even number, so the
// shift below moves 2^30 by whole steps of two bit positions.
zcount = clrbit(zcount, #0) // even power of 2
{
bit = lsr(bit, zcount) // 2^30 right shifted
// bit.new reads the value written by the shift in this same packet.
// NOTE(review): bit == 0 here presumably only happens for num == 0
// (leading-zero count of 32) - confirm.
if (cmp.eq(bit.new, #0)) jump:nt .done // return if bit == 0
}
.falign
.loop:
// One iteration of the reference "while (bit != 0)" loop, split into three
// packets: compute candidates, select, advance.
{
// Calculate quantities to be used in the conditional below
temp = add(bit, res)
res_shift = lsr(res, #1)
bit_shift = lsr(bit, #2)
}
{
// Conditionally assign to num and res
// p0 is produced and consumed (p0.new) inside this one packet; the three
// conditional instructions read the old values of num, res and bit.
// NOTE(review): cmp.ltu is a strict unsigned "temp < num", whereas the
// reference C code tests "num >= res + bit". When temp == num the subtract
// is skipped here; the rounding step at .done looks like it restores the
// same final result - confirm, or switch to a non-strict compare.
p0 = cmp.ltu(temp, num)
if (p0.new) num = sub(num, temp)
if (p0.new) res = add(res_shift, bit)
if (!p0.new) res = res_shift
}
{
// Advance bit >> 2 and exit loop if done
// The compare sees the freshly assigned bit (bit.new); the branch is hinted
// as taken (:t) because the loop normally runs several iterations.
bit = bit_shift
if (cmp.gt(bit.new, #0)) jump:t .loop
}
.falign
.done:
// if (num > res && res != 0xffff) {
// ++res
// }
// return res in num (R0)
{
// temp is reused here to hold the 0xffff limit checked before rounding up
temp = ##65535
}
{
// Both compares write p0 in the same packet.
// NOTE(review): this relies on Hexagon combining same-packet writes to one
// predicate register with a logical AND (num > res AND res != 0xffff) -
// confirm against the Hexagon Programmer's Reference Manual.
// NOTE(review): cmp.gt is a signed compare; this assumes the remainder in
// num and the root in res are both below 2^31 at this point - confirm.
p0 = cmp.gt(num, res)
p0 = !cmp.eq(res, temp)
if (p0.new) num = add(res, #1)
if (!p0.new) num = res
}
{
// Return to the caller through the address held in r31.
jumpr r31
}
.size SignalHexagonSqrt32, .-SignalHexagonSqrt32
188 changes: 188 additions & 0 deletions signal/micro/kernels/hexagon/rfft_int16.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "signal/src/msb.h"
#include "signal/src/rfft.h"

// Do not reorder these headers. "typedef.h" must appear before rfft.h
#include "typedef.h"
extern "C" {
#include "rfft.h"
}

namespace tflm_signal {

// TODO(b/467010877): The twiddle tables should come from the tflite file.
// This array holds the first 3*512/4 = 384 complex twiddle factors (768 16-bit
// values, two per complex element) of the twiddle table found in:
// HEXAGON_Tools/<TOOL_VERSION>/Examples/libcore/SigProc/cxFFT_IFFT/include
const uint16_t twiddles[] __attribute__((aligned(8))) = {
0x7fff, 0x0000, 0x0000, 0x8000, 0x0000, 0x8000, 0x0000, 0x8000, 0xa57e,
0xa57e, 0xa57e, 0x5a82, 0x5a82, 0xa57e, 0xcf05, 0x89bf, 0x89bf, 0xcf05,
0xa57e, 0xa57e, 0x89bf, 0xcf05, 0x30fc, 0x7642, 0x7642, 0xcf05, 0xe708,
0x8276, 0xb8e4, 0x9593, 0xcf05, 0x89bf, 0x9593, 0xb8e4, 0xe708, 0x7d8a,
0x30fc, 0x89bf, 0xb8e4, 0x9593, 0x8276, 0x18f9, 0x89bf, 0xcf05, 0x8276,
0xe708, 0x6a6e, 0x471d, 0x7d8a, 0xe708, 0xf375, 0x809e, 0xdad8, 0x8583,
0xe708, 0x8276, 0x9d0e, 0xaecd, 0xc3aa, 0x70e3, 0x471d, 0x9593, 0xc3aa,
0x8f1e, 0x809e, 0xf375, 0x9593, 0xb8e4, 0x8583, 0xdad8, 0x5134, 0x62f2,
0x6a6e, 0xb8e4, 0xdad8, 0x8583, 0x9d0e, 0xaecd, 0xb8e4, 0x9593, 0x8f1e,
0xc3aa, 0x0c8c, 0x7f62, 0x18f9, 0x8276, 0xaecd, 0x9d0e, 0x8f1e, 0x3c57,
0x8276, 0xe708, 0x809e, 0xf375, 0x7a7d, 0x2528, 0x7f62, 0xf375, 0xf9b9,
0x8028, 0xed38, 0x8163, 0xf375, 0x809e, 0xa129, 0xaa0b, 0xb3c1, 0x66d0,
0x5134, 0x9d0e, 0xc946, 0x8c4b, 0x83d7, 0xe0e7, 0x9d0e, 0xaecd, 0x877c,
0xd4e1, 0x41ce, 0x6dca, 0x70e3, 0xc3aa, 0xe0e7, 0x83d7, 0xaa0b, 0xa129,
0xc3aa, 0x8f1e, 0x9236, 0xbe32, 0xf9b9, 0x7fd9, 0x2528, 0x8583, 0xb3c1,
0x9931, 0x877c, 0x2b1f, 0x8583, 0xdad8, 0x8163, 0xed38, 0x73b6, 0x36ba,
0x7a7d, 0xdad8, 0xed38, 0x8163, 0xc946, 0x8c4b, 0xdad8, 0x8583, 0x9931,
0xb3c1, 0xd4e1, 0x7885, 0x3c57, 0x8f1e, 0xbe32, 0x9236, 0x8028, 0x0648,
0x8f1e, 0xc3aa, 0x83d7, 0xe0e7, 0x5ed7, 0x55f6, 0x62f2, 0xaecd, 0xd4e1,
0x877c, 0x9236, 0xbe32, 0xaecd, 0x9d0e, 0x8c4b, 0xc946, 0x1f1a, 0x7c2a,
0x0c8c, 0x809e, 0xaa0b, 0xa129, 0x9931, 0x4c40, 0x809e, 0xf375, 0x8028,
0xf9b9, 0x7e9d, 0x12c8, 0x7fd9, 0xf9b9, 0xfcdc, 0x800a, 0xf696, 0x8059,
0xf9b9, 0x8028, 0xa34c, 0xa7be, 0xac65, 0x60ec, 0x55f6, 0xa129, 0xcc22,
0x8afc, 0x8676, 0xd7da, 0xa129, 0xaa0b, 0x8894, 0xd1ef, 0x398d, 0x7255,
0x73b6, 0xc946, 0xe3f5, 0x831d, 0xb141, 0x9b18, 0xc946, 0x8c4b, 0x93dc,
0xbb86, 0xf055, 0x7f0a, 0x2b1f, 0x877c, 0xb64c, 0x975a, 0x84a3, 0x2224,
0x877c, 0xd4e1, 0x81e3, 0xea1e, 0x6f5f, 0x3f17, 0x7c2a, 0xe0e7, 0xf055,
0x80f7, 0xd1ef, 0x8894, 0xe0e7, 0x83d7, 0x9b18, 0xb141, 0xcc22, 0x7505,
0x41ce, 0x9236, 0xc0e9, 0x90a1, 0x800a, 0xfcdc, 0x9236, 0xbe32, 0x84a3,
0xdddd, 0x5843, 0x5cb4, 0x66d0, 0xb3c1, 0xd7da, 0x8676, 0x975a, 0xb64c,
0xb3c1, 0x9931, 0x8dab, 0xc674, 0x15e2, 0x7e1e, 0x12c8, 0x8163, 0xac65,
0x9f14, 0x93dc, 0x447b, 0x8163, 0xed38, 0x8059, 0xf696, 0x7ce4, 0x1c0c,
0x7e9d, 0xed38, 0xf696, 0x8059, 0xe3f5, 0x831d, 0xed38, 0x8163, 0x9f14,
0xac65, 0xbb86, 0x6c24, 0x4c40, 0x9931, 0xc674, 0x8dab, 0x81e3, 0xea1e,
0x9931, 0xb3c1, 0x8676, 0xd7da, 0x49b4, 0x68a7, 0x6dca, 0xbe32, 0xdddd,
0x84a3, 0xa34c, 0xa7be, 0xbe32, 0x9236, 0x90a1, 0xc0e9, 0x0324, 0x7ff6,
0x1f1a, 0x83d7, 0xb141, 0x9b18, 0x8afc, 0x33df, 0x83d7, 0xe0e7, 0x80f7,
0xf055, 0x776c, 0x2e11, 0x7885, 0xd4e1, 0xea1e, 0x81e3, 0xc0e9, 0x90a1,
0xd4e1, 0x877c, 0x975a, 0xb64c, 0xdddd, 0x7b5d, 0x36ba, 0x8c4b, 0xbb86,
0x93dc, 0x80f7, 0x0fab, 0x8c4b, 0xc946, 0x831d, 0xe3f5, 0x64e9, 0x4ec0,
0x5ed7, 0xaa0b, 0xd1ef, 0x8894, 0x8dab, 0xc674, 0xaa0b, 0xa129, 0x8afc,
0xcc22, 0x2827, 0x798a, 0x0648, 0x8028, 0xa7be, 0xa34c, 0x9f14, 0x539b,
0x8028, 0xf9b9, 0x800a, 0xfcdc, 0x7fa7, 0x096b, 0x7ff6, 0xfcdc, 0xfe6e,
0x8003, 0xfb4a, 0x8017, 0xfcdc, 0x800a, 0xa463, 0xa69c, 0xa8e3, 0x5dc8,
0x5843, 0xa34c, 0xcd92, 0x8a5b, 0x8806, 0xd368, 0xa34c, 0xa7be, 0x8927,
0xd079, 0x354e, 0x7460, 0x7505, 0xcc22, 0xe57e, 0x82c7, 0xb505, 0x9843,
0xcc22, 0x8afc, 0x94b6, 0xba33, 0xebab, 0x7e60, 0x2e11, 0x8894, 0xb797,
0x9674, 0x8377, 0x1d93, 0x8894, 0xd1ef, 0x822a, 0xe893, 0x6cf9, 0x4326,
0x7ce4, 0xe3f5, 0xf1e5, 0x80c8, 0xd65d, 0x86f7, 0xe3f5, 0x831d, 0x9c11,
0xb005, 0xc7dc, 0x7308, 0x447b, 0x93dc, 0xc248, 0x8fdd, 0x803e, 0xf827,
0x93dc, 0xbb86, 0x8511, 0xdc5a, 0x54ca, 0x5fe4, 0x68a7, 0xb64c, 0xd958,
0x85fb, 0x9a23, 0xb27f, 0xb64c, 0x975a, 0x8e62, 0xc50e, 0x113a, 0x7ed6,
0x15e2, 0x81e3, 0xad97, 0x9e0f, 0x916a, 0x4074, 0x81e3, 0xea1e, 0x8079,
0xf505, 0x7bc6, 0x209f, 0x7f0a, 0xf055, 0xf827, 0x803e, 0xe893, 0x822a,
0xf055, 0x80f7, 0xa01d, 0xab36, 0xb797, 0x698c, 0x4ec0, 0x9b18, 0xc7dc,
0x8cf9, 0x82c7, 0xe57e, 0x9b18, 0xb141, 0x86f7, 0xd65d, 0x45cd, 0x6b4b,
0x6f5f, 0xc0e9, 0xdf61, 0x843b, 0xa69c, 0xa463, 0xc0e9, 0x90a1, 0x916a,
0xbf8d, 0xfe6e, 0x7ffe, 0x2224, 0x84a3, 0xb27f, 0x9a23, 0x8927, 0x2f87,
0x84a3, 0xdddd, 0x812b, 0xeec7, 0x75a6, 0x326e, 0x798a, 0xd7da, 0xebab,
0x81a1, 0xc50e, 0x8e62, 0xd7da, 0x8676, 0x9843, 0xb505, 0xd958, 0x7a06,
0x398d, 0x8dab, 0xbcdb, 0x9307, 0x8079, 0x0afb, 0x8dab, 0xc674, 0x8377,
0xe26d, 0x61f1, 0x5269, 0x60ec, 0xac65, 0xd368, 0x8806, 0x8fdd, 0xc248,
0xac65, 0x9f14, 0x8ba1, 0xcab3, 0x23a7, 0x7aef, 0x096b, 0x8059, 0xa8e3,
0xa239, 0x9c11, 0x4ffb, 0x8059, 0xf696, 0x8017, 0xfb4a, 0x7f38, 0x0e1c,
0x7fa7, 0xf696, 0xfb4a, 0x8017, 0xf1e5, 0x80c8, 0xf696, 0x8059, 0xa239,
0xa8e3, 0xb005, 0x63ef, 0x539b, 0x9f14, 0xcab3, 0x8ba1, 0x8511, 0xdc5a,
0x9f14, 0xac65, 0x8806, 0xd368, 0x3db8, 0x7023, 0x7255, 0xc674, 0xe26d,
0x8377, 0xad97, 0x9e0f, 0xc674, 0x8dab, 0x9307, 0xbcdb, 0xf505, 0x7f87,
0x2827, 0x8676, 0xb505, 0x9843, 0x85fb, 0x26a8, 0x8676, 0xd7da, 0x81a1,
0xebab, 0x719e, 0x3af3, 0x7b5d, 0xdddd, 0xeec7, 0x812b, 0xcd92, 0x8a5b,
0xdddd, 0x84a3, 0x9a23, 0xb27f, 0xd079, 0x76d9, 0x3f17, 0x90a1, 0xbf8d,
0x916a, 0x8003, 0x0192, 0x90a1, 0xc0e9, 0x843b, 0xdf61, 0x5b9d, 0x5964,
0x64e9, 0xb141, 0xd65d, 0x86f7, 0x94b6, 0xba33, 0xb141, 0x9b18, 0x8cf9,
0xc7dc, 0x1a83, 0x7d3a, 0x0fab, 0x80f7, 0xab36, 0xa01d, 0x9674, 0x486a,
0x80f7, 0xf055, 0x803e, 0xf827, 0x7dd6, 0x176e, 0x7e1e, 0xea1e, 0xf505,
0x8079, 0xdf61, 0x843b, 0xea1e, 0x81e3, 0x9e0f, 0xad97, 0xbf8d, 0x6e97,
0x49b4, 0x975a, 0xc50e, 0x8e62, 0x812b, 0xeec7, 0x975a, 0xb64c, 0x85fb,
0xd958, 0x4d81, 0x65de, 0x6c24, 0xbb86, 0xdc5a, 0x8511, 0xa01d, 0xab36,
0xbb86, 0x93dc, 0x8fdd, 0xc248, 0x07d9, 0x7fc2, 0x1c0c, 0x831d, 0xb005,
0x9c11, 0x8cf9, 0x3825, 0x831d, 0xe3f5, 0x80c8, 0xf1e5, 0x790a, 0x29a4,
0x776c, 0xd1ef, 0xe893, 0x822a, 0xbcdb, 0x9307, 0xd1ef, 0x8894, 0x9674,
0xb797, 0xe26d, 0x7c89, 0x33df, 0x8afc, 0xba33, 0x94b6, 0x81a1, 0x1455,
0x8afc, 0xcc22, 0x82c7, 0xe57e, 0x67bd, 0x4afb, 0x5cb4, 0xa7be, 0xd079,
0x8927, 0x8ba1, 0xcab3, 0xa7be, 0xa34c, 0x8a5b, 0xcd92, 0x2c99, 0x77fb,
0x0324, 0x800a, 0xa69c, 0xa463, 0xa239, 0x571e, 0x800a, 0xfcdc, 0x8003,
0xfe6e, 0x7fea, 0x04b6};

// Twiddle factors used for the last stage of the N-point real FFT,
// generated as j*W^k for k = 1, 2, ..., N/4.
// That is 128 complex int16_t elements,
// i.e. 256 real int16_t values.
const uint16_t rtwiddles[] __attribute__((aligned(8))) = {
0x0192, 0x7ffe, 0x0324, 0x7ff6, 0x04b6, 0x7fea, 0x0648, 0x7fd9, 0x07d9,
0x7fc2, 0x096b, 0x7fa7, 0x0afb, 0x7f87, 0x0c8c, 0x7f62, 0x0e1c, 0x7f38,
0x0fab, 0x7f0a, 0x113a, 0x7ed6, 0x12c8, 0x7e9d, 0x1455, 0x7e60, 0x15e2,
0x7e1e, 0x176e, 0x7dd6, 0x18f9, 0x7d8a, 0x1a83, 0x7d3a, 0x1c0c, 0x7ce4,
0x1d93, 0x7c89, 0x1f1a, 0x7c2a, 0x209f, 0x7bc6, 0x2224, 0x7b5d, 0x23a7,
0x7aef, 0x2528, 0x7a7d, 0x26a8, 0x7a06, 0x2827, 0x798a, 0x29a4, 0x790a,
0x2b1f, 0x7885, 0x2c99, 0x77fb, 0x2e11, 0x776c, 0x2f87, 0x76d9, 0x30fc,
0x7642, 0x326e, 0x75a6, 0x33df, 0x7505, 0x354e, 0x7460, 0x36ba, 0x73b6,
0x3825, 0x7308, 0x398d, 0x7255, 0x3af3, 0x719e, 0x3c57, 0x70e3, 0x3db8,
0x7023, 0x3f17, 0x6f5f, 0x4074, 0x6e97, 0x41ce, 0x6dca, 0x4326, 0x6cf9,
0x447b, 0x6c24, 0x45cd, 0x6b4b, 0x471d, 0x6a6e, 0x486a, 0x698c, 0x49b4,
0x68a7, 0x4afb, 0x67bd, 0x4c40, 0x66d0, 0x4d81, 0x65de, 0x4ec0, 0x64e9,
0x4ffb, 0x63ef, 0x5134, 0x62f2, 0x5269, 0x61f1, 0x539b, 0x60ec, 0x54ca,
0x5fe4, 0x55f6, 0x5ed7, 0x571e, 0x5dc8, 0x5843, 0x5cb4, 0x5964, 0x5b9d,
0x5a82, 0x5a82, 0x5b9d, 0x5964, 0x5cb4, 0x5843, 0x5dc8, 0x571e, 0x5ed7,
0x55f6, 0x5fe4, 0x54ca, 0x60ec, 0x539b, 0x61f1, 0x5269, 0x62f2, 0x5134,
0x63ef, 0x4ffb, 0x64e9, 0x4ec0, 0x65de, 0x4d81, 0x66d0, 0x4c40, 0x67bd,
0x4afb, 0x68a7, 0x49b4, 0x698c, 0x486a, 0x6a6e, 0x471d, 0x6b4b, 0x45cd,
0x6c24, 0x447b, 0x6cf9, 0x4326, 0x6dca, 0x41ce, 0x6e97, 0x4074, 0x6f5f,
0x3f17, 0x7023, 0x3db8, 0x70e3, 0x3c57, 0x719e, 0x3af3, 0x7255, 0x398d,
0x7308, 0x3825, 0x73b6, 0x36ba, 0x7460, 0x354e, 0x7505, 0x33df, 0x75a6,
0x326e, 0x7642, 0x30fc, 0x76d9, 0x2f87, 0x776c, 0x2e11, 0x77fb, 0x2c99,
0x7885, 0x2b1f, 0x790a, 0x29a4, 0x798a, 0x2827, 0x7a06, 0x26a8, 0x7a7d,
0x2528, 0x7aef, 0x23a7, 0x7b5d, 0x2224, 0x7bc6, 0x209f, 0x7c2a, 0x1f1a,
0x7c89, 0x1d93, 0x7ce4, 0x1c0c, 0x7d3a, 0x1a83, 0x7d8a, 0x18f9, 0x7dd6,
0x176e, 0x7e1e, 0x15e2, 0x7e60, 0x1455, 0x7e9d, 0x12c8, 0x7ed6, 0x113a,
0x7f0a, 0x0fab, 0x7f38, 0x0e1c, 0x7f62, 0x0c8c, 0x7f87, 0x0afb, 0x7fa7,
0x096b, 0x7fc2, 0x07d9, 0x7fd9, 0x0648, 0x7fea, 0x04b6, 0x7ff6, 0x0324,
0x7ffe, 0x0192, 0x7fff, 0x0000};

// Bookkeeping header stored at the start of the caller-provided state memory.
struct RfftState {
  // Working buffer inside the state memory; chosen by RfftInt16Init() and
  // filled with the input samples on every RfftInt16Apply() call.
  int16_t* aligned_input;
  // FFT size, in int16_t samples, as given to RfftInt16Init().
  int32_t fft_length;
};

// Returns the number of bytes of state memory needed for an FFT of
// `fft_length` samples: one RfftState header plus room for 2 * fft_length
// int16_t values (twice the input size, which leaves RfftInt16Init() slack to
// pick an aligned start address inside the buffer).
size_t RfftInt16GetNeededMemory(int32_t fft_length) {
  const size_t header_bytes = sizeof(RfftState);
  const size_t buffer_bytes = 2 * sizeof(int16_t) * fft_length;
  return header_bytes + buffer_bytes;
}

void* RfftInt16Init(int32_t fft_length, void* state, size_t state_size) {
RfftState* rfft_state = (RfftState*)state;
int16_t* unaligned_buffer = (int16_t*)(rfft_state + 1);
rfft_state->aligned_input =
(int16_t*)((uint32_t)(unaligned_buffer + fft_length) &
~((1 << tflite::tflm_signal::MostSignificantBit32(
fft_length)) -
1));
rfft_state->fft_length = fft_length;
return state;
}

void RfftInt16Apply(void* state, const int16_t* input,
Complex<int16_t>* output) {
RfftState* rfft_state = (RfftState*)state;
memcpy(rfft_state->aligned_input, input,
rfft_state->fft_length * sizeof(int16_t));
rfft(rfft_state->aligned_input, rfft_state->fft_length, (CWord2x16*)twiddles,
(CWord2x16*)rtwiddles, (CWord2x16*)output);
return;
}

} // namespace tflm_signal

28 changes: 28 additions & 0 deletions signal/micro/kernels/hexagon/square_root_32.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "signal/src/square_root.h"

// Implemented in assembly; declared with C linkage so the symbol name matches.
extern "C" uint16_t SignalHexagonSqrt32(uint32_t num);

namespace tflite {
namespace tflm_signal {

// Forwards to SignalHexagonSqrt32(). This C++ wrapper exists only so that
// TFLM's source specialization selects the optimized Hexagon implementation
// in place of the portable one.
uint16_t Sqrt32(uint32_t num) {
  const uint16_t root = SignalHexagonSqrt32(num);
  return root;
}

}  // namespace tflm_signal
}  // namespace tflite
Loading
Loading