mirror of https://github.com/g4klx/MMDVM.git
Get it compiling but still incomplete.
This commit is contained in:
parent
d9932fe5c0
commit
05d1e7e9dc
|
@ -1,7 +1,3 @@
|
|||
*.o
|
||||
obj/
|
||||
bin/
|
||||
GitVersion.h
|
||||
build/
|
||||
.pio
|
||||
.vscode
|
||||
|
||||
|
|
|
@ -15,13 +15,10 @@ framework = arduino
|
|||
build_unflags = -O0
|
||||
monitor_speed = 460800
|
||||
build_flags =
|
||||
-O3
|
||||
-Wall
|
||||
; -D ARM_MATH_CM7
|
||||
; -mfpu=fpv5-sp-d16
|
||||
-D ARM_MATH_CM4
|
||||
-mfpu=fpv4-sp-d16
|
||||
-mfloat-abi=hard
|
||||
; increase the buffer sizes
|
||||
-D SERIAL_RX_BUFFER_SIZE=2048
|
||||
-D SERIAL_TX_BUFFER_SIZE=2048
|
||||
-O3
|
||||
-Wall
|
||||
-D ARM_MATH_CM4
|
||||
-mfpu=fpv4-sp-d16
|
||||
-mfloat-abi=hard
|
||||
-D SERIAL_RX_BUFFER_SIZE=2048
|
||||
-D SERIAL_TX_BUFFER_SIZE=2048
|
||||
|
|
|
@ -0,0 +1,509 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_biquad_cascade_df1_q31.c
|
||||
* Description: Processing function for the Q31 Biquad cascade filter
|
||||
*
|
||||
* $Date: 23 April 2021
|
||||
* $Revision: V1.9.0
|
||||
*
|
||||
* Target Processor: Cortex-M and Cortex-A cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/filtering_functions.h"
|
||||
|
||||
/**
|
||||
@ingroup groupFilters
|
||||
*/
|
||||
|
||||
/**
|
||||
@addtogroup BiquadCascadeDF1
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Processing function for the Q31 Biquad cascade filter.
|
||||
@param[in] S points to an instance of the Q31 Biquad cascade structure
|
||||
@param[in] pSrc points to the block of input data
|
||||
@param[out] pDst points to the block of output data
|
||||
@param[in] blockSize number of samples to process
|
||||
|
||||
@par Scaling and Overflow Behavior
|
||||
The function is implemented using an internal 64-bit accumulator.
|
||||
The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
|
||||
Thus, if the accumulator result overflows it wraps around rather than clip.
|
||||
In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
|
||||
After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
|
||||
1.31 format by discarding the low 32 bits.
|
||||
@remark
|
||||
Refer to \ref arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter.
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
__STATIC_INLINE void arm_biquad_cascade_df1_q31(
|
||||
const arm_biquad_casd_df1_inst_q31 * S,
|
||||
const q31_t * pSrc,
|
||||
q31_t * pDst,
|
||||
uint32_t blockSize)
|
||||
{
|
||||
const q31_t *pIn = pSrc; /* input pointer initialization */
|
||||
q31_t *pOut = pDst; /* output pointer initialization */
|
||||
int shift;
|
||||
uint32_t stages = S->numStages; /* loop counters */
|
||||
int postShift = S->postShift;
|
||||
q31x4_t b0Coeffs, b1Coeffs, a0Coeffs, a1Coeffs; /* Coefficients vector */
|
||||
q31x4_t stateVec = { 0 };
|
||||
q31_t *pState = S->pState; /* pState pointer initialization */
|
||||
q31x4_t inVec0;
|
||||
int64_t acc;
|
||||
const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
|
||||
q31_t out, out1;
|
||||
|
||||
|
||||
shift = (postShift + 1 + 8);
|
||||
|
||||
do {
|
||||
/*
|
||||
* Reading the coefficients
|
||||
* generates :
|
||||
* Fwd0 { b2 b1 b0 0 }
|
||||
* Fwd1 { 0 b2 b1 b0 }
|
||||
* Bwd0 { 0 0 a2 a1 }
|
||||
* Bwd0 { 0 0 a1 a2 }
|
||||
* (can be moved in init)
|
||||
*/
|
||||
b0Coeffs = vdupq_n_s32(0);
|
||||
a0Coeffs = vdupq_n_s32(0);
|
||||
|
||||
b0Coeffs[0] = pCoeffs[2]; // b2
|
||||
b0Coeffs[1] = pCoeffs[1]; // b1
|
||||
b0Coeffs[2] = pCoeffs[0]; // b0
|
||||
|
||||
b1Coeffs = b0Coeffs;
|
||||
uint32_t zero = 0;
|
||||
b1Coeffs = vshlcq_s32(b1Coeffs, &zero, 32);
|
||||
|
||||
a0Coeffs[2] = pCoeffs[4];
|
||||
a0Coeffs[3] = pCoeffs[3];
|
||||
a1Coeffs = vrev64q_s32(a0Coeffs);
|
||||
|
||||
|
||||
/*
|
||||
* prologue consumes history samples
|
||||
*/
|
||||
|
||||
/* 2 first elements are garbage, will be updated with history */
|
||||
inVec0 = vld1q(pIn - 2);
|
||||
|
||||
inVec0[0] = pState[1];
|
||||
inVec0[1] = pState[0];
|
||||
|
||||
stateVec[2] = pState[3];
|
||||
stateVec[3] = pState[2];
|
||||
|
||||
acc = vrmlaldavhq(b0Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
stateVec[2] = out;
|
||||
acc = vrmlaldavhq(b1Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
|
||||
|
||||
acc = lsll(acc, shift);
|
||||
out1 = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
|
||||
inVec0 = vld1q(pIn);
|
||||
pIn += 2;
|
||||
|
||||
/*
|
||||
* main loop
|
||||
*/
|
||||
uint32_t sample = (blockSize - 2) >> 2U;
|
||||
/*
|
||||
* First part of the processing with loop unrolling.
|
||||
* Compute 4 outputs at a time.
|
||||
*/
|
||||
while (sample > 0U) {
|
||||
|
||||
stateVec[3] = out1;
|
||||
|
||||
*pOut++ = out;
|
||||
*pOut++ = out1;
|
||||
|
||||
/*
|
||||
* in { x0 x1 x2 x3 }
|
||||
* x
|
||||
* b0Coeffs { b2 b1 b0 0 }
|
||||
*/
|
||||
acc = vrmlaldavhq(b0Coeffs, inVec0);
|
||||
/*
|
||||
* out { 0 0 yn2 yn1 }
|
||||
* x
|
||||
* a0Coeffs { 0 0 a2 a1 }
|
||||
*/
|
||||
acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
stateVec[2] = out;
|
||||
|
||||
/*
|
||||
* in { x0 x1 x2 x3 }
|
||||
* x
|
||||
* b0Coeffs { 0 b2 b1 b0 }
|
||||
*/
|
||||
acc = vrmlaldavhq(b1Coeffs, inVec0);
|
||||
/*
|
||||
* out { 0 0 y0 yn1 }
|
||||
* x
|
||||
* a0Coeffs { 0 0 a1 a2 }
|
||||
*/
|
||||
acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out1 = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
stateVec[3] = out1;
|
||||
|
||||
inVec0 = vld1q(pIn);
|
||||
pIn += 2;
|
||||
|
||||
/* unrolled part */
|
||||
*pOut++ = out;
|
||||
*pOut++ = out1;
|
||||
|
||||
acc = vrmlaldavhq(b0Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
stateVec[2] = out;
|
||||
|
||||
acc = vrmlaldavhq(b1Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out1 = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
inVec0 = vld1q(pIn);
|
||||
pIn += 2;
|
||||
|
||||
sample--;
|
||||
}
|
||||
|
||||
*pOut++ = out;
|
||||
*pOut++ = out1;
|
||||
|
||||
/*
|
||||
* Tail handling
|
||||
*/
|
||||
int32_t loopRemainder = blockSize & 3;
|
||||
if (loopRemainder == 2) {
|
||||
/*
|
||||
* Store the updated state variables back into the pState array
|
||||
*/
|
||||
pState[0] = inVec0[1];
|
||||
pState[1] = inVec0[0];
|
||||
pState[3] = out;
|
||||
pState[2] = out1;
|
||||
} else if (loopRemainder == 1) {
|
||||
stateVec[3] = out1;
|
||||
|
||||
acc = vrmlaldavhq(b0Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
stateVec[2] = out;
|
||||
|
||||
acc = vrmlaldavhq(b1Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out1 = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
stateVec[3] = out1;
|
||||
|
||||
inVec0 = vld1q(pIn);
|
||||
pIn += 2;
|
||||
|
||||
*pOut++ = out;
|
||||
*pOut++ = out1;
|
||||
|
||||
acc = vrmlaldavhq(b0Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
*pOut++ = out;
|
||||
|
||||
/*
|
||||
* Store the updated state variables back into the pState array
|
||||
*/
|
||||
pState[0] = inVec0[2];
|
||||
pState[1] = inVec0[1];
|
||||
pState[3] = out1;
|
||||
pState[2] = out;
|
||||
} else if (loopRemainder == 0) {
|
||||
stateVec[3] = out1;
|
||||
|
||||
acc = vrmlaldavhq(b0Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
stateVec[2] = out;
|
||||
|
||||
acc = vrmlaldavhq(b1Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out1 = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
*pOut++ = out;
|
||||
*pOut++ = out1;
|
||||
|
||||
/*
|
||||
* Store the updated state variables back into the pState array
|
||||
*/
|
||||
pState[0] = inVec0[3];
|
||||
pState[1] = inVec0[2];
|
||||
pState[3] = out;
|
||||
pState[2] = out1;
|
||||
} else {
|
||||
stateVec[3] = out1;
|
||||
|
||||
acc = vrmlaldavhq(b0Coeffs, inVec0);
|
||||
acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
|
||||
acc = lsll(acc, shift);
|
||||
out = (q31_t) ((acc >> 32) & 0xffffffff);
|
||||
|
||||
*pOut++ = out;
|
||||
|
||||
/*
|
||||
* Store the updated state variables back into the pState array
|
||||
*/
|
||||
pState[0] = inVec0[2];
|
||||
pState[1] = inVec0[1];
|
||||
pState[3] = out1;
|
||||
pState[2] = out;
|
||||
}
|
||||
|
||||
|
||||
pCoeffs += 5;
|
||||
pState += 4;
|
||||
|
||||
/* The first stage goes from the input buffer to the output buffer. */
|
||||
/* Subsequent stages occur in-place in the output buffer */
|
||||
pIn = pDst;
|
||||
|
||||
/* Reset to destination pointer */
|
||||
pOut = pDst;
|
||||
}
|
||||
while (--stages);
|
||||
}
|
||||
#else
|
||||
void arm_biquad_cascade_df1_q31(
|
||||
const arm_biquad_casd_df1_inst_q31 * S,
|
||||
const q31_t * pSrc,
|
||||
q31_t * pDst,
|
||||
uint32_t blockSize)
|
||||
{
|
||||
const q31_t *pIn = pSrc; /* Source pointer */
|
||||
q31_t *pOut = pDst; /* Destination pointer */
|
||||
q31_t *pState = S->pState; /* pState pointer */
|
||||
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
||||
q63_t acc; /* Accumulator */
|
||||
q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
|
||||
q31_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
|
||||
q31_t Xn; /* Temporary input */
|
||||
uint32_t uShift = ((uint32_t) S->postShift + 1U);
|
||||
uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
|
||||
uint32_t sample, stage = S->numStages; /* Loop counters */
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
q31_t acc_l, acc_h; /* temporary output variables */
|
||||
#endif
|
||||
|
||||
do
|
||||
{
|
||||
/* Reading the coefficients */
|
||||
b0 = *pCoeffs++;
|
||||
b1 = *pCoeffs++;
|
||||
b2 = *pCoeffs++;
|
||||
a1 = *pCoeffs++;
|
||||
a2 = *pCoeffs++;
|
||||
|
||||
/* Reading the pState values */
|
||||
Xn1 = pState[0];
|
||||
Xn2 = pState[1];
|
||||
Yn1 = pState[2];
|
||||
Yn2 = pState[3];
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Apply loop unrolling and compute 4 output values simultaneously. */
|
||||
/* Variable acc hold output values that are being computed:
|
||||
*
|
||||
* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
|
||||
*/
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
sample = blockSize >> 2U;
|
||||
|
||||
while (sample > 0U)
|
||||
{
|
||||
/* Read the first input */
|
||||
Xn = *pIn++;
|
||||
|
||||
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
|
||||
acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
|
||||
|
||||
/* The result is converted to 1.31 , Yn2 variable is reused */
|
||||
acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
|
||||
acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
|
||||
|
||||
/* Apply shift for lower part of acc and upper part of acc */
|
||||
Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
|
||||
|
||||
/* Store output in destination buffer. */
|
||||
*pOut++ = Yn2;
|
||||
|
||||
/* Read the second input */
|
||||
Xn2 = *pIn++;
|
||||
|
||||
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
|
||||
acc = ((q63_t) b0 * Xn2) + ((q63_t) b1 * Xn) + ((q63_t) b2 * Xn1) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
|
||||
|
||||
/* The result is converted to 1.31, Yn1 variable is reused */
|
||||
acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
|
||||
acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
|
||||
|
||||
/* Apply shift for lower part of acc and upper part of acc */
|
||||
Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
|
||||
|
||||
/* Store output in destination buffer. */
|
||||
*pOut++ = Yn1;
|
||||
|
||||
/* Read the third input */
|
||||
Xn1 = *pIn++;
|
||||
|
||||
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
|
||||
acc = ((q63_t) b0 * Xn1) + ((q63_t) b1 * Xn2) + ((q63_t) b2 * Xn) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
|
||||
|
||||
/* The result is converted to 1.31, Yn2 variable is reused */
|
||||
acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
|
||||
acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
|
||||
|
||||
/* Apply shift for lower part of acc and upper part of acc */
|
||||
Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
|
||||
|
||||
/* Store output in destination buffer. */
|
||||
*pOut++ = Yn2;
|
||||
|
||||
/* Read the forth input */
|
||||
Xn = *pIn++;
|
||||
|
||||
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
|
||||
acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
|
||||
|
||||
/* The result is converted to 1.31, Yn1 variable is reused */
|
||||
acc_l = (acc ) & 0xffffffff; /* Calc lower part of acc */
|
||||
acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
|
||||
|
||||
/* Apply shift for lower part of acc and upper part of acc */
|
||||
Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
|
||||
|
||||
/* Store output in destination buffer. */
|
||||
*pOut++ = Yn1;
|
||||
|
||||
/* Every time after the output is computed state should be updated. */
|
||||
/* The states should be updated as: */
|
||||
/* Xn2 = Xn1 */
|
||||
/* Xn1 = Xn */
|
||||
/* Yn2 = Yn1 */
|
||||
/* Yn1 = acc */
|
||||
Xn2 = Xn1;
|
||||
Xn1 = Xn;
|
||||
|
||||
/* decrement loop counter */
|
||||
sample--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
sample = blockSize & 0x3U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize blkCnt with number of samples */
|
||||
sample = blockSize;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (sample > 0U)
|
||||
{
|
||||
/* Read the input */
|
||||
Xn = *pIn++;
|
||||
|
||||
/* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
|
||||
acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
|
||||
|
||||
/* The result is converted to 1.31 */
|
||||
acc = acc >> lShift;
|
||||
|
||||
/* Store output in destination buffer. */
|
||||
*pOut++ = (q31_t) acc;
|
||||
|
||||
/* Every time after the output is computed state should be updated. */
|
||||
/* The states should be updated as: */
|
||||
/* Xn2 = Xn1 */
|
||||
/* Xn1 = Xn */
|
||||
/* Yn2 = Yn1 */
|
||||
/* Yn1 = acc */
|
||||
Xn2 = Xn1;
|
||||
Xn1 = Xn;
|
||||
Yn2 = Yn1;
|
||||
Yn1 = (q31_t) acc;
|
||||
|
||||
/* decrement loop counter */
|
||||
sample--;
|
||||
}
|
||||
|
||||
/* Store the updated state variables back into the pState array */
|
||||
*pState++ = Xn1;
|
||||
*pState++ = Xn2;
|
||||
*pState++ = Yn1;
|
||||
*pState++ = Yn2;
|
||||
|
||||
/* The first stage goes from the input buffer to the output buffer. */
|
||||
/* Subsequent numStages occur in-place in the output buffer */
|
||||
pIn = pDst;
|
||||
|
||||
/* Reset output pointer */
|
||||
pOut = pDst;
|
||||
|
||||
/* decrement loop counter */
|
||||
stage--;
|
||||
|
||||
} while (stage > 0U);
|
||||
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEI) */
|
||||
|
||||
/**
|
||||
@} end of BiquadCascadeDF1 group
|
||||
*/
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,332 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_fir_fast_q15.c
|
||||
* Description: Q15 Fast FIR filter processing function
|
||||
*
|
||||
* $Date: 23 April 2021
|
||||
* $Revision: V1.9.0
|
||||
*
|
||||
* Target Processor: Cortex-M and Cortex-A cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/filtering_functions.h"
|
||||
|
||||
/**
|
||||
@ingroup groupFilters
|
||||
*/
|
||||
|
||||
/**
|
||||
@addtogroup FIR
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Processing function for the Q15 FIR filter (fast version).
|
||||
@param[in] S points to an instance of the Q15 FIR filter structure
|
||||
@param[in] pSrc points to the block of input data
|
||||
@param[out] pDst points to the block of output data
|
||||
@param[in] blockSize number of samples to process
|
||||
@return none
|
||||
|
||||
@par Scaling and Overflow Behavior
|
||||
This fast version uses a 32-bit accumulator with 2.30 format.
|
||||
The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
|
||||
Thus, if the accumulator result overflows it wraps around and distorts the result.
|
||||
In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
|
||||
The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
|
||||
|
||||
@remark
|
||||
Refer to \ref arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
|
||||
Use function \ref arm_fir_init_q15() to initialize the filter structure.
|
||||
*/
|
||||
|
||||
void arm_fir_fast_q15(
|
||||
const arm_fir_instance_q15 * S,
|
||||
const q15_t * pSrc,
|
||||
q15_t * pDst,
|
||||
uint32_t blockSize)
|
||||
{
|
||||
q15_t *pState = S->pState; /* State pointer */
|
||||
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
||||
q15_t *pStateCurnt; /* Points to the current sample of the state */
|
||||
q15_t *px; /* Temporary pointer for state buffer */
|
||||
const q15_t *pb; /* Temporary pointer for coefficient buffer */
|
||||
q31_t acc0; /* Accumulators */
|
||||
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
|
||||
uint32_t tapCnt, blkCnt; /* Loop counters */
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
q31_t acc1, acc2, acc3; /* Accumulators */
|
||||
q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
|
||||
#endif
|
||||
|
||||
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
|
||||
/* pStateCurnt points to the location where the new input data should be written */
|
||||
pStateCurnt = &(S->pState[(numTaps - 1U)]);
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 output values simultaneously.
|
||||
* The variables acc0 ... acc3 hold output values that are being computed:
|
||||
*
|
||||
* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
|
||||
* acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
|
||||
* acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
|
||||
* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
|
||||
*/
|
||||
blkCnt = blockSize >> 2U;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* Copy 4 new input samples into the state buffer. */
|
||||
*pStateCurnt++ = *pSrc++;
|
||||
*pStateCurnt++ = *pSrc++;
|
||||
*pStateCurnt++ = *pSrc++;
|
||||
*pStateCurnt++ = *pSrc++;
|
||||
|
||||
/* Set all accumulators to zero */
|
||||
acc0 = 0;
|
||||
acc1 = 0;
|
||||
acc2 = 0;
|
||||
acc3 = 0;
|
||||
|
||||
/* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
|
||||
px = pState;
|
||||
|
||||
/* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
|
||||
pb = pCoeffs;
|
||||
|
||||
/* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
|
||||
x0 = read_q15x2_ia (&px);
|
||||
|
||||
/* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
|
||||
x2 = read_q15x2_ia (&px);
|
||||
|
||||
/* Loop over the number of taps. Unroll by a factor of 4.
|
||||
Repeat until we've computed numTaps-(numTaps%4) coefficients. */
|
||||
tapCnt = numTaps >> 2U;
|
||||
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
/* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
|
||||
c0 = read_q15x2_ia ((q15_t **) &pb);
|
||||
|
||||
/* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
|
||||
acc0 = __SMLAD(x0, c0, acc0);
|
||||
|
||||
/* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
|
||||
acc2 = __SMLAD(x2, c0, acc2);
|
||||
|
||||
/* pack x[n-N-1] and x[n-N-2] */
|
||||
#ifndef ARM_MATH_BIG_ENDIAN
|
||||
x1 = __PKHBT(x2, x0, 0);
|
||||
#else
|
||||
x1 = __PKHBT(x0, x2, 0);
|
||||
#endif
|
||||
|
||||
/* Read state x[n-N-4], x[n-N-5] */
|
||||
x0 = read_q15x2_ia (&px);
|
||||
|
||||
/* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
|
||||
acc1 = __SMLADX(x1, c0, acc1);
|
||||
|
||||
/* pack x[n-N-3] and x[n-N-4] */
|
||||
#ifndef ARM_MATH_BIG_ENDIAN
|
||||
x1 = __PKHBT(x0, x2, 0);
|
||||
#else
|
||||
x1 = __PKHBT(x2, x0, 0);
|
||||
#endif
|
||||
|
||||
/* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
|
||||
acc3 = __SMLADX(x1, c0, acc3);
|
||||
|
||||
/* Read coefficients b[N-2], b[N-3] */
|
||||
c0 = read_q15x2_ia ((q15_t **) &pb);
|
||||
|
||||
/* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
|
||||
acc0 = __SMLAD(x2, c0, acc0);
|
||||
|
||||
/* Read state x[n-N-6], x[n-N-7] with offset */
|
||||
x2 = read_q15x2_ia (&px);
|
||||
|
||||
/* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
|
||||
acc2 = __SMLAD(x0, c0, acc2);
|
||||
|
||||
/* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
|
||||
acc1 = __SMLADX(x1, c0, acc1);
|
||||
|
||||
/* pack x[n-N-5] and x[n-N-6] */
|
||||
#ifndef ARM_MATH_BIG_ENDIAN
|
||||
x1 = __PKHBT(x2, x0, 0);
|
||||
#else
|
||||
x1 = __PKHBT(x0, x2, 0);
|
||||
#endif
|
||||
|
||||
/* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
|
||||
acc3 = __SMLADX(x1, c0, acc3);
|
||||
|
||||
/* Decrement tap count */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
/* If the filter length is not a multiple of 4, compute the remaining filter taps.
|
||||
This is always be 2 taps since the filter length is even. */
|
||||
if ((numTaps & 0x3U) != 0U)
|
||||
{
|
||||
/* Read last two coefficients */
|
||||
c0 = read_q15x2_ia ((q15_t **) &pb);
|
||||
|
||||
/* Perform the multiply-accumulates */
|
||||
acc0 = __SMLAD(x0, c0, acc0);
|
||||
acc2 = __SMLAD(x2, c0, acc2);
|
||||
|
||||
/* pack state variables */
|
||||
#ifndef ARM_MATH_BIG_ENDIAN
|
||||
x1 = __PKHBT(x2, x0, 0);
|
||||
#else
|
||||
x1 = __PKHBT(x0, x2, 0);
|
||||
#endif
|
||||
|
||||
/* Read last state variables */
|
||||
x0 = read_q15x2 (px);
|
||||
|
||||
/* Perform the multiply-accumulates */
|
||||
acc1 = __SMLADX(x1, c0, acc1);
|
||||
|
||||
/* pack state variables */
|
||||
#ifndef ARM_MATH_BIG_ENDIAN
|
||||
x1 = __PKHBT(x0, x2, 0);
|
||||
#else
|
||||
x1 = __PKHBT(x2, x0, 0);
|
||||
#endif
|
||||
|
||||
/* Perform the multiply-accumulates */
|
||||
acc3 = __SMLADX(x1, c0, acc3);
|
||||
}
|
||||
|
||||
/* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
|
||||
Then store the 4 outputs in the destination buffer. */
|
||||
#ifndef ARM_MATH_BIG_ENDIAN
|
||||
write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
|
||||
write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
|
||||
#else
|
||||
write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
|
||||
write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
|
||||
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
|
||||
|
||||
/* Advance the state pointer by 4 to process the next group of 4 samples */
|
||||
pState = pState + 4U;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining output samples */
|
||||
blkCnt = blockSize % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize blkCnt with number of taps */
|
||||
blkCnt = blockSize;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* Copy two samples into state buffer */
|
||||
*pStateCurnt++ = *pSrc++;
|
||||
|
||||
/* Set the accumulator to zero */
|
||||
acc0 = 0;
|
||||
|
||||
/* Use SIMD to hold states and coefficients */
|
||||
px = pState;
|
||||
pb = pCoeffs;
|
||||
|
||||
tapCnt = numTaps >> 1U;
|
||||
|
||||
do
|
||||
{
|
||||
acc0 += (q31_t) *px++ * *pb++;
|
||||
acc0 += (q31_t) *px++ * *pb++;
|
||||
|
||||
tapCnt--;
|
||||
}
|
||||
while (tapCnt > 0U);
|
||||
|
||||
/* The result is in 2.30 format. Convert to 1.15 with saturation.
|
||||
Then store the output in the destination buffer. */
|
||||
*pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
|
||||
|
||||
/* Advance state pointer by 1 for the next sample */
|
||||
pState = pState + 1U;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Processing is complete.
|
||||
Now copy the last numTaps - 1 samples to the start of the state buffer.
|
||||
This prepares the state buffer for the next function call. */
|
||||
|
||||
/* Points to the start of the state buffer */
|
||||
pStateCurnt = S->pState;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 taps at a time */
|
||||
tapCnt = (numTaps - 1U) >> 2U;
|
||||
|
||||
/* Copy data */
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
*pStateCurnt++ = *pState++;
|
||||
*pStateCurnt++ = *pState++;
|
||||
*pStateCurnt++ = *pState++;
|
||||
*pStateCurnt++ = *pState++;
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
/* Calculate remaining number of copies */
|
||||
tapCnt = (numTaps - 1U) % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize tapCnt with number of taps */
|
||||
tapCnt = (numTaps - 1U);
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
/* Copy remaining data */
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
*pStateCurnt++ = *pState++;
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
@} end of FIR group
|
||||
*/
|
|
@ -0,0 +1,774 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_fir_interpolate_q15.c
|
||||
* Description: Q15 FIR interpolation
|
||||
*
|
||||
* $Date: 23 April 2021
|
||||
* $Revision: V1.9.0
|
||||
*
|
||||
* Target Processor: Cortex-M and Cortex-A cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/filtering_functions.h"
|
||||
|
||||
/**
|
||||
@ingroup groupFilters
|
||||
*/
|
||||
|
||||
/**
|
||||
@addtogroup FIR_Interpolate
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Processing function for the Q15 FIR interpolator.
|
||||
@param[in] S points to an instance of the Q15 FIR interpolator structure
|
||||
@param[in] pSrc points to the block of input data
|
||||
@param[out] pDst points to the block of output data
|
||||
@param[in] blockSize number of samples to process
|
||||
@return none
|
||||
|
||||
@par Scaling and Overflow Behavior
|
||||
The function is implemented using a 64-bit internal accumulator.
|
||||
Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
|
||||
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
|
||||
There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
|
||||
After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
|
||||
Lastly, the accumulator is saturated to yield a result in 1.15 format.
|
||||
*/
|
||||
|
||||
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
#include "arm_helium_utils.h"
|
||||
__STATIC_INLINE void arm_fir_interpolate_q15(
|
||||
const arm_fir_interpolate_instance_q15 * S,
|
||||
const q15_t * pSrc,
|
||||
q15_t * pDst,
|
||||
uint32_t blockSize)
|
||||
{
|
||||
q15_t *pState = S->pState; /* State pointer */
|
||||
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
||||
q15_t *pStateCurnt; /* Points to the current sample of the state */
|
||||
const q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
|
||||
|
||||
uint32_t i, blkCnt; /* Loop counters */
|
||||
uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
|
||||
uint16_t strides[8] = {
|
||||
0, 1 * S->L, 2 * S->L, 3 * S->L,
|
||||
4 * S->L, 5 * S->L, 6 * S->L, 7 * S->L
|
||||
};
|
||||
uint16x8_t vec_strides0 = *(uint16x8_t *) strides;
|
||||
uint16x8_t vec_strides1 = vec_strides0 + 1;
|
||||
uint16x8_t vec_strides2 = vec_strides0 + 2;
|
||||
uint16x8_t vec_strides3 = vec_strides0 + 3;
|
||||
q15x8_t vecState, vecCoef;
|
||||
|
||||
/*
|
||||
* S->pState buffer contains previous frame (phaseLen - 1) samples
|
||||
* pStateCurnt points to the location where the new input data should be written
|
||||
*/
|
||||
pStateCurnt = S->pState + ((q15_t) phaseLen - 1);
|
||||
/*
|
||||
* Total number of intput samples
|
||||
*/
|
||||
blkCnt = blockSize;
|
||||
/*
|
||||
* Loop over the blockSize.
|
||||
*/
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/*
|
||||
* Copy new input sample into the state buffer
|
||||
*/
|
||||
*pStateCurnt++ = *pSrc++;
|
||||
/*
|
||||
* Loop over the Interpolation factor.
|
||||
*/
|
||||
i = S->L;
|
||||
while (i > 0U)
|
||||
{
|
||||
/*
|
||||
* Initialize state pointer
|
||||
*/
|
||||
ptr1 = pState;
|
||||
if (i >= 4)
|
||||
{
|
||||
/*
|
||||
* Initialize coefficient pointer
|
||||
*/
|
||||
ptr2 = pCoeffs + (i - 1 - 3U);
|
||||
|
||||
q63_t acc0 = 0LL;
|
||||
q63_t acc1 = 0LL;
|
||||
q63_t acc2 = 0LL;
|
||||
q63_t acc3 = 0LL;
|
||||
|
||||
uint32_t tapCnt = phaseLen >> 3;
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
vecState = vldrhq_s16(ptr1);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides3);
|
||||
acc0 = vmlaldavaq(acc0, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides2);
|
||||
acc1 = vmlaldavaq(acc1, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
|
||||
acc2 = vmlaldavaq(acc2, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
|
||||
acc3 = vmlaldavaq(acc3, vecState, vecCoef);
|
||||
|
||||
ptr1 += 8;
|
||||
ptr2 = ptr2 + S->L * 8;
|
||||
tapCnt--;
|
||||
}
|
||||
tapCnt = phaseLen & 7;
|
||||
if (tapCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(tapCnt);
|
||||
|
||||
vecState = vldrhq_z_s16(ptr1, p0);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides3, p0);
|
||||
acc0 = vmlaldavaq(acc0, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides2, p0);
|
||||
acc1 = vmlaldavaq(acc1, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
|
||||
acc2 = vmlaldavaq(acc2, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
|
||||
acc3 = vmlaldavaq(acc3, vecState, vecCoef);
|
||||
}
|
||||
|
||||
acc0 = asrl(acc0, 15);
|
||||
acc1 = asrl(acc1, 15);
|
||||
acc2 = asrl(acc2, 15);
|
||||
acc3 = asrl(acc3, 15);
|
||||
|
||||
*pDst++ = (q15_t) __SSAT(acc0, 16);
|
||||
*pDst++ = (q15_t) __SSAT(acc1, 16);
|
||||
*pDst++ = (q15_t) __SSAT(acc2, 16);
|
||||
*pDst++ = (q15_t) __SSAT(acc3, 16);
|
||||
i -= 4;
|
||||
}
|
||||
else if (i >= 3)
|
||||
{
|
||||
/*
|
||||
* Initialize coefficient pointer
|
||||
*/
|
||||
ptr2 = pCoeffs + (i - 1U - 2);
|
||||
|
||||
q63_t acc0 = 0LL;
|
||||
q63_t acc1 = 0LL;
|
||||
q63_t acc2 = 0LL;
|
||||
|
||||
uint32_t tapCnt = phaseLen >> 3;
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
vecState = vldrhq_s16(ptr1);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides2);
|
||||
acc0 = vmlaldavaq(acc0, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
|
||||
acc1 = vmlaldavaq(acc1, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
|
||||
acc2 = vmlaldavaq(acc2, vecState, vecCoef);
|
||||
|
||||
ptr1 += 8;
|
||||
ptr2 = ptr2 + S->L * 8;
|
||||
tapCnt--;
|
||||
}
|
||||
tapCnt = phaseLen & 7;
|
||||
if (tapCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(tapCnt);
|
||||
|
||||
vecState = vldrhq_z_s16(ptr1, p0);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides2, p0);
|
||||
acc0 = vmlaldavaq(acc0, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
|
||||
acc1 = vmlaldavaq(acc1, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
|
||||
acc2 = vmlaldavaq(acc2, vecState, vecCoef);
|
||||
}
|
||||
|
||||
acc0 = asrl(acc0, 15);
|
||||
acc1 = asrl(acc1, 15);
|
||||
acc2 = asrl(acc2, 15);
|
||||
|
||||
*pDst++ = (q15_t) __SSAT(acc0, 16);;
|
||||
*pDst++ = (q15_t) __SSAT(acc1, 16);;
|
||||
*pDst++ = (q15_t) __SSAT(acc2, 16);;
|
||||
i -= 3;
|
||||
}
|
||||
else if (i >= 2)
|
||||
{
|
||||
/*
|
||||
* Initialize coefficient pointer
|
||||
*/
|
||||
ptr2 = pCoeffs + (i - 1U - 1);
|
||||
|
||||
q63_t acc0 = 0LL;
|
||||
q63_t acc1 = 0LL;
|
||||
|
||||
uint32_t tapCnt = phaseLen >> 3;
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
vecState = vldrhq_s16(ptr1);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
|
||||
acc0 = vmlaldavaq(acc0, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
|
||||
acc1 = vmlaldavaq(acc1, vecState, vecCoef);
|
||||
|
||||
ptr1 += 8;
|
||||
ptr2 = ptr2 + S->L * 8;
|
||||
tapCnt--;
|
||||
}
|
||||
tapCnt = phaseLen & 7;
|
||||
if (tapCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(tapCnt);
|
||||
|
||||
vecState = vldrhq_z_s16(ptr1, p0);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
|
||||
acc0 = vmlaldavaq(acc0, vecState, vecCoef);
|
||||
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
|
||||
acc1 = vmlaldavaq(acc1, vecState, vecCoef);
|
||||
}
|
||||
|
||||
acc0 = asrl(acc0, 15);
|
||||
acc1 = asrl(acc1, 15);
|
||||
|
||||
*pDst++ = (q15_t) __SSAT(acc0, 16);
|
||||
*pDst++ = (q15_t) __SSAT(acc1, 16);
|
||||
i -= 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Initialize coefficient pointer
|
||||
*/
|
||||
ptr2 = pCoeffs + (i - 1U);
|
||||
|
||||
q63_t acc0 = 0LL;
|
||||
|
||||
uint32_t tapCnt = phaseLen >> 3;
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
vecState = vldrhq_s16(ptr1);
|
||||
vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
|
||||
|
||||
acc0 = vmlaldavaq(acc0, vecState, vecCoef);
|
||||
|
||||
ptr1 += 8;
|
||||
ptr2 = ptr2 + S->L * 8;
|
||||
tapCnt--;
|
||||
}
|
||||
tapCnt = phaseLen & 7;
|
||||
if (tapCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(tapCnt);
|
||||
|
||||
vecState = vldrhq_z_s16(ptr1, p0);
|
||||
vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
|
||||
acc0 = vmlaldavaq(acc0, vecState, vecCoef);
|
||||
}
|
||||
|
||||
acc0 = asrl(acc0, 15);
|
||||
*pDst++ = (q15_t) __SSAT(acc0, 16);
|
||||
/*
|
||||
* Decrement the loop counter
|
||||
*/
|
||||
i--;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Advance the state pointer by 1
|
||||
* * to process the next group of interpolation factor number samples
|
||||
*/
|
||||
pState = pState + 1;
|
||||
/*
|
||||
* Decrement the loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/*
|
||||
* Processing is complete.
|
||||
* Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
|
||||
* This prepares the state buffer for the next function call.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Points to the start of the state buffer
|
||||
*/
|
||||
pStateCurnt = S->pState;
|
||||
blkCnt = (phaseLen - 1U) >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
vstrhq_s16(pStateCurnt, vldrhq_s16(pState));
|
||||
pState += 8;
|
||||
pStateCurnt += 8;
|
||||
blkCnt--;
|
||||
}
|
||||
blkCnt = (phaseLen - 1U) & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
vstrhq_p_s16(pStateCurnt, vldrhq_s16(pState), p0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
void arm_fir_interpolate_q15(
|
||||
const arm_fir_interpolate_instance_q15 * S,
|
||||
const q15_t * pSrc,
|
||||
q15_t * pDst,
|
||||
uint32_t blockSize)
|
||||
{
|
||||
#if (1)
|
||||
//#if !defined(ARM_MATH_CM0_FAMILY)
|
||||
|
||||
q15_t *pState = S->pState; /* State pointer */
|
||||
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
||||
q15_t *pStateCur; /* Points to the current sample of the state */
|
||||
q15_t *ptr1; /* Temporary pointer for state buffer */
|
||||
const q15_t *ptr2; /* Temporary pointer for coefficient buffer */
|
||||
q63_t sum0; /* Accumulators */
|
||||
uint32_t i, blkCnt, tapCnt; /* Loop counters */
|
||||
uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
|
||||
uint32_t j;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
q63_t acc0, acc1, acc2, acc3;
|
||||
q15_t x0, x1, x2, x3;
|
||||
q15_t c0, c1, c2, c3;
|
||||
#endif
|
||||
|
||||
/* S->pState buffer contains previous frame (phaseLen - 1) samples */
|
||||
/* pStateCur points to the location where the new input data should be written */
|
||||
pStateCur = S->pState + (phaseLen - 1U);
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
blkCnt = blockSize >> 2U;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* Copy new input sample into the state buffer */
|
||||
*pStateCur++ = *pSrc++;
|
||||
*pStateCur++ = *pSrc++;
|
||||
*pStateCur++ = *pSrc++;
|
||||
*pStateCur++ = *pSrc++;
|
||||
|
||||
/* Address modifier index of coefficient buffer */
|
||||
j = 1U;
|
||||
|
||||
/* Loop over the Interpolation factor. */
|
||||
i = (S->L);
|
||||
|
||||
while (i > 0U)
|
||||
{
|
||||
/* Set accumulator to zero */
|
||||
acc0 = 0;
|
||||
acc1 = 0;
|
||||
acc2 = 0;
|
||||
acc3 = 0;
|
||||
|
||||
/* Initialize state pointer */
|
||||
ptr1 = pState;
|
||||
|
||||
/* Initialize coefficient pointer */
|
||||
ptr2 = pCoeffs + (S->L - j);
|
||||
|
||||
/* Loop over the polyPhase length. Unroll by a factor of 4.
|
||||
Repeat until we've computed numTaps-(4*S->L) coefficients. */
|
||||
tapCnt = phaseLen >> 2U;
|
||||
|
||||
x0 = *(ptr1++);
|
||||
x1 = *(ptr1++);
|
||||
x2 = *(ptr1++);
|
||||
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
/* Read the input sample */
|
||||
x3 = *(ptr1++);
|
||||
|
||||
/* Read the coefficient */
|
||||
c0 = *(ptr2);
|
||||
|
||||
/* Perform the multiply-accumulate */
|
||||
acc0 += (q63_t) x0 * c0;
|
||||
acc1 += (q63_t) x1 * c0;
|
||||
acc2 += (q63_t) x2 * c0;
|
||||
acc3 += (q63_t) x3 * c0;
|
||||
|
||||
/* Read the coefficient */
|
||||
c1 = *(ptr2 + S->L);
|
||||
|
||||
/* Read the input sample */
|
||||
x0 = *(ptr1++);
|
||||
|
||||
/* Perform the multiply-accumulate */
|
||||
acc0 += (q63_t) x1 * c1;
|
||||
acc1 += (q63_t) x2 * c1;
|
||||
acc2 += (q63_t) x3 * c1;
|
||||
acc3 += (q63_t) x0 * c1;
|
||||
|
||||
/* Read the coefficient */
|
||||
c2 = *(ptr2 + S->L * 2);
|
||||
|
||||
/* Read the input sample */
|
||||
x1 = *(ptr1++);
|
||||
|
||||
/* Perform the multiply-accumulate */
|
||||
acc0 += (q63_t) x2 * c2;
|
||||
acc1 += (q63_t) x3 * c2;
|
||||
acc2 += (q63_t) x0 * c2;
|
||||
acc3 += (q63_t) x1 * c2;
|
||||
|
||||
/* Read the coefficient */
|
||||
c3 = *(ptr2 + S->L * 3);
|
||||
|
||||
/* Read the input sample */
|
||||
x2 = *(ptr1++);
|
||||
|
||||
/* Perform the multiply-accumulate */
|
||||
acc0 += (q63_t) x3 * c3;
|
||||
acc1 += (q63_t) x0 * c3;
|
||||
acc2 += (q63_t) x1 * c3;
|
||||
acc3 += (q63_t) x2 * c3;
|
||||
|
||||
|
||||
/* Upsampling is done by stuffing L-1 zeros between each sample.
|
||||
* So instead of multiplying zeros with coefficients,
|
||||
* Increment the coefficient pointer by interpolation factor times. */
|
||||
ptr2 += 4 * S->L;
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
/* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
|
||||
tapCnt = phaseLen % 0x4U;
|
||||
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
/* Read the input sample */
|
||||
x3 = *(ptr1++);
|
||||
|
||||
/* Read the coefficient */
|
||||
c0 = *(ptr2);
|
||||
|
||||
/* Perform the multiply-accumulate */
|
||||
acc0 += (q63_t) x0 * c0;
|
||||
acc1 += (q63_t) x1 * c0;
|
||||
acc2 += (q63_t) x2 * c0;
|
||||
acc3 += (q63_t) x3 * c0;
|
||||
|
||||
/* Increment the coefficient pointer by interpolation factor times. */
|
||||
ptr2 += S->L;
|
||||
|
||||
/* update states for next sample processing */
|
||||
x0 = x1;
|
||||
x1 = x2;
|
||||
x2 = x3;
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
/* The result is in the accumulator, store in the destination buffer. */
|
||||
*(pDst ) = (q15_t) (__SSAT((acc0 >> 15), 16));
|
||||
*(pDst + S->L) = (q15_t) (__SSAT((acc1 >> 15), 16));
|
||||
*(pDst + 2 * S->L) = (q15_t) (__SSAT((acc2 >> 15), 16));
|
||||
*(pDst + 3 * S->L) = (q15_t) (__SSAT((acc3 >> 15), 16));
|
||||
|
||||
pDst++;
|
||||
|
||||
/* Increment the address modifier index of coefficient buffer */
|
||||
j++;
|
||||
|
||||
/* Decrement loop counter */
|
||||
i--;
|
||||
}
|
||||
|
||||
/* Advance the state pointer by 1
|
||||
* to process the next group of interpolation factor number samples */
|
||||
pState = pState + 4;
|
||||
|
||||
pDst += S->L * 3;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
blkCnt = blockSize % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize blkCnt with number of samples */
|
||||
blkCnt = blockSize;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* Copy new input sample into the state buffer */
|
||||
*pStateCur++ = *pSrc++;
|
||||
|
||||
/* Address modifier index of coefficient buffer */
|
||||
j = 1U;
|
||||
|
||||
/* Loop over the Interpolation factor. */
|
||||
i = S->L;
|
||||
while (i > 0U)
|
||||
{
|
||||
/* Set accumulator to zero */
|
||||
sum0 = 0;
|
||||
|
||||
/* Initialize state pointer */
|
||||
ptr1 = pState;
|
||||
|
||||
/* Initialize coefficient pointer */
|
||||
ptr2 = pCoeffs + (S->L - j);
|
||||
|
||||
/* Loop over the polyPhase length.
|
||||
Repeat until we've computed numTaps-(4*S->L) coefficients. */
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
tapCnt = phaseLen >> 2U;
|
||||
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
/* Perform the multiply-accumulate */
|
||||
sum0 += (q63_t) *ptr1++ * *ptr2;
|
||||
|
||||
/* Upsampling is done by stuffing L-1 zeros between each sample.
|
||||
* So instead of multiplying zeros with coefficients,
|
||||
* Increment the coefficient pointer by interpolation factor times. */
|
||||
ptr2 += S->L;
|
||||
|
||||
sum0 += (q63_t) *ptr1++ * *ptr2;
|
||||
ptr2 += S->L;
|
||||
|
||||
sum0 += (q63_t) *ptr1++ * *ptr2;
|
||||
ptr2 += S->L;
|
||||
|
||||
sum0 += (q63_t) *ptr1++ * *ptr2;
|
||||
ptr2 += S->L;
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
tapCnt = phaseLen % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize tapCnt with number of samples */
|
||||
tapCnt = phaseLen;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
/* Perform the multiply-accumulate */
|
||||
sum0 += (q63_t) *ptr1++ * *ptr2;
|
||||
|
||||
/* Upsampling is done by stuffing L-1 zeros between each sample.
|
||||
* So instead of multiplying zeros with coefficients,
|
||||
* Increment the coefficient pointer by interpolation factor times. */
|
||||
ptr2 += S->L;
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
/* The result is in the accumulator, store in the destination buffer. */
|
||||
*pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
|
||||
|
||||
/* Increment the address modifier index of coefficient buffer */
|
||||
j++;
|
||||
|
||||
/* Decrement the loop counter */
|
||||
i--;
|
||||
}
|
||||
|
||||
/* Advance the state pointer by 1
|
||||
* to process the next group of interpolation factor number samples */
|
||||
pState = pState + 1;
|
||||
|
||||
/* Decrement the loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Processing is complete.
|
||||
Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
|
||||
This prepares the state buffer for the next function call. */
|
||||
|
||||
/* Points to the start of the state buffer */
|
||||
pStateCur = S->pState;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
tapCnt = (phaseLen - 1U) >> 2U;
|
||||
|
||||
/* copy data */
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
|
||||
write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
tapCnt = (phaseLen - 1U) % 0x04U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize tapCnt with number of samples */
|
||||
tapCnt = (phaseLen - 1U);
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
/* Copy data */
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
*pStateCur++ = *pState++;
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
#else
|
||||
/* alternate version for CM0_FAMILY */
|
||||
|
||||
q15_t *pState = S->pState; /* State pointer */
|
||||
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
||||
q15_t *pStateCur; /* Points to the current sample of the state */
|
||||
q15_t *ptr1; /* Temporary pointer for state buffer */
|
||||
const q15_t *ptr2; /* Temporary pointer for coefficient buffer */
|
||||
q63_t sum0; /* Accumulators */
|
||||
uint32_t i, blkCnt, tapCnt; /* Loop counters */
|
||||
uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
|
||||
|
||||
/* S->pState buffer contains previous frame (phaseLen - 1) samples */
|
||||
/* pStateCur points to the location where the new input data should be written */
|
||||
pStateCur = S->pState + (phaseLen - 1U);
|
||||
|
||||
/* Total number of intput samples */
|
||||
blkCnt = blockSize;
|
||||
|
||||
/* Loop over the blockSize. */
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* Copy new input sample into the state buffer */
|
||||
*pStateCur++ = *pSrc++;
|
||||
|
||||
/* Loop over the Interpolation factor. */
|
||||
i = S->L;
|
||||
|
||||
while (i > 0U)
|
||||
{
|
||||
/* Set accumulator to zero */
|
||||
sum0 = 0;
|
||||
|
||||
/* Initialize state pointer */
|
||||
ptr1 = pState;
|
||||
|
||||
/* Initialize coefficient pointer */
|
||||
ptr2 = pCoeffs + (i - 1U);
|
||||
|
||||
/* Loop over the polyPhase length */
|
||||
tapCnt = phaseLen;
|
||||
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
/* Perform the multiply-accumulate */
|
||||
sum0 += ((q63_t) *ptr1++ * *ptr2);
|
||||
|
||||
/* Increment the coefficient pointer by interpolation factor times. */
|
||||
ptr2 += S->L;
|
||||
|
||||
/* Decrement the loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
/* Store the result after converting to 1.15 format in the destination buffer. */
|
||||
*pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
|
||||
|
||||
/* Decrement loop counter */
|
||||
i--;
|
||||
}
|
||||
|
||||
/* Advance the state pointer by 1
|
||||
* to process the next group of interpolation factor number samples */
|
||||
pState = pState + 1;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Processing is complete.
|
||||
** Now copy the last phaseLen - 1 samples to the start of the state buffer.
|
||||
** This prepares the state buffer for the next function call. */
|
||||
|
||||
/* Points to the start of the state buffer */
|
||||
pStateCur = S->pState;
|
||||
|
||||
tapCnt = phaseLen - 1U;
|
||||
|
||||
/* Copy data */
|
||||
while (tapCnt > 0U)
|
||||
{
|
||||
*pStateCur++ = *pState++;
|
||||
|
||||
/* Decrement loop counter */
|
||||
tapCnt--;
|
||||
}
|
||||
|
||||
#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
|
||||
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEI)*/
|
||||
/**
|
||||
@} end of FIR_Interpolate group
|
||||
*/
|
|
@ -0,0 +1,181 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_q15_to_q31.c
|
||||
* Description: Converts the elements of the Q15 vector to Q31 vector
|
||||
*
|
||||
* $Date: 23 April 2021
|
||||
* $Revision: V1.9.0
|
||||
*
|
||||
* Target Processor: Cortex-M and Cortex-A cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/support_functions.h"
|
||||
|
||||
/**
|
||||
@ingroup groupSupport
|
||||
*/
|
||||
|
||||
/**
|
||||
@addtogroup q15_to_x
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Converts the elements of the Q15 vector to Q31 vector.
|
||||
@param[in] pSrc points to the Q15 input vector
|
||||
@param[out] pDst points to the Q31 output vector
|
||||
@param[in] blockSize number of samples in each vector
|
||||
|
||||
@par Details
|
||||
The equation used for the conversion process is:
|
||||
<pre>
|
||||
pDst[n] = (q31_t) pSrc[n] << 16; 0 <= n < blockSize.
|
||||
</pre>
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
__STATIC_INLINE void arm_q15_to_q31(
|
||||
const q15_t * pSrc,
|
||||
q31_t * pDst,
|
||||
uint32_t blockSize)
|
||||
{
|
||||
|
||||
uint32_t blkCnt;
|
||||
|
||||
q31x4_t vecDst;
|
||||
|
||||
blkCnt = blockSize>> 2;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
|
||||
/* C = (q31_t)A << 16 */
|
||||
/* convert from q15 to q31 and then store the results in the destination buffer */
|
||||
/* load q15 + 32-bit widening */
|
||||
vecDst = vldrhq_s32((q15_t const *) pSrc);
|
||||
vecDst = vshlq_n(vecDst, 16);
|
||||
vstrwq_s32(pDst, vecDst);
|
||||
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
* Advance vector source and destination pointers
|
||||
*/
|
||||
pDst += 4;
|
||||
pSrc += 4;
|
||||
blkCnt --;
|
||||
}
|
||||
|
||||
blkCnt = blockSize & 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C = (q31_t) A << 16 */
|
||||
|
||||
/* Convert from q15 to q31 and store result in destination buffer */
|
||||
*pDst++ = (q31_t) *pSrc++ << 16;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
}
|
||||
#else
|
||||
void arm_q15_to_q31(
|
||||
const q15_t * pSrc,
|
||||
q31_t * pDst,
|
||||
uint32_t blockSize)
|
||||
{
|
||||
uint32_t blkCnt; /* Loop counter */
|
||||
const q15_t *pIn = pSrc; /* Source pointer */
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
q31_t in1, in2;
|
||||
q31_t out1, out2, out3, out4;
|
||||
#endif
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
blkCnt = blockSize >> 2U;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C = (q31_t)A << 16 */
|
||||
|
||||
/* Convert from q15 to q31 and store result in destination buffer */
|
||||
in1 = read_q15x2_ia (&pIn);
|
||||
in2 = read_q15x2_ia (&pIn);
|
||||
|
||||
#ifndef ARM_MATH_BIG_ENDIAN
|
||||
|
||||
/* extract lower 16 bits to 32 bit result */
|
||||
out1 = in1 << 16U;
|
||||
/* extract upper 16 bits to 32 bit result */
|
||||
out2 = in1 & 0xFFFF0000;
|
||||
/* extract lower 16 bits to 32 bit result */
|
||||
out3 = in2 << 16U;
|
||||
/* extract upper 16 bits to 32 bit result */
|
||||
out4 = in2 & 0xFFFF0000;
|
||||
|
||||
#else
|
||||
|
||||
/* extract upper 16 bits to 32 bit result */
|
||||
out1 = in1 & 0xFFFF0000;
|
||||
/* extract lower 16 bits to 32 bit result */
|
||||
out2 = in1 << 16U;
|
||||
/* extract upper 16 bits to 32 bit result */
|
||||
out3 = in2 & 0xFFFF0000;
|
||||
/* extract lower 16 bits to 32 bit result */
|
||||
out4 = in2 << 16U;
|
||||
|
||||
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
|
||||
|
||||
*pDst++ = out1;
|
||||
*pDst++ = out2;
|
||||
*pDst++ = out3;
|
||||
*pDst++ = out4;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
blkCnt = blockSize % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize blkCnt with number of samples */
|
||||
blkCnt = blockSize;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C = (q31_t) A << 16 */
|
||||
|
||||
/* Convert from q15 to q31 and store result in destination buffer */
|
||||
*pDst++ = (q31_t) *pIn++ << 16;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEI) */
|
||||
|
||||
/**
|
||||
@} end of q15_to_x group
|
||||
*/
|
|
@ -0,0 +1,82 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_sin_q31.c
|
||||
* Description: Fast sine calculation for Q31 values
|
||||
*
|
||||
* $Date: 23 April 2021
|
||||
* $Revision: V1.9.0
|
||||
*
|
||||
* Target Processor: Cortex-M and Cortex-A cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/fast_math_functions.h"
|
||||
#include "arm_common_tables.h"
|
||||
|
||||
/**
|
||||
@ingroup groupFastMath
|
||||
*/
|
||||
|
||||
/**
|
||||
@addtogroup sin
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Fast approximation to the trigonometric sine function for Q31 data.
|
||||
@param[in] x Scaled input value in radians
|
||||
@return sin(x)
|
||||
|
||||
The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
|
||||
*/
|
||||
|
||||
q31_t arm_sin_q31(
|
||||
q31_t x)
|
||||
{
|
||||
q31_t sinVal; /* Temporary variables for input, output */
|
||||
int32_t index; /* Index variable */
|
||||
q31_t a, b; /* Two nearest output values */
|
||||
q31_t fract; /* Temporary values for fractional values */
|
||||
|
||||
if (x < 0)
|
||||
{ /* convert negative numbers to corresponding positive ones */
|
||||
x = (uint32_t)x + 0x80000000;
|
||||
}
|
||||
|
||||
/* Calculate the nearest index */
|
||||
index = (uint32_t)x >> FAST_MATH_Q31_SHIFT;
|
||||
|
||||
/* Calculation of fractional value */
|
||||
fract = (x - (index << FAST_MATH_Q31_SHIFT)) << 9;
|
||||
|
||||
/* Read two nearest values of input value from the sin table */
|
||||
a = sinTable_q31[index];
|
||||
b = sinTable_q31[index+1];
|
||||
|
||||
/* Linear interpolation process */
|
||||
sinVal = (q63_t) (0x80000000 - fract) * a >> 32;
|
||||
sinVal = (q31_t) ((((q63_t) sinVal << 32) + ((q63_t) fract * b)) >> 32);
|
||||
|
||||
/* Return output value */
|
||||
return (sinVal << 1);
|
||||
}
|
||||
|
||||
/**
|
||||
@} end of sin group
|
||||
*/
|
4
src/IO.h
4
src/IO.h
|
@ -58,10 +58,6 @@ public:
|
|||
void resetWatchdog();
|
||||
uint32_t getWatchdog();
|
||||
|
||||
uint8_t getCPU() const;
|
||||
|
||||
void getUDID(uint8_t* buffer);
|
||||
|
||||
void selfTest();
|
||||
|
||||
private:
|
||||
|
|
|
@ -28,6 +28,10 @@
|
|||
#include "Version.h"
|
||||
#include "IOPins.h"
|
||||
|
||||
#if defined(I2C_REPEATER)
|
||||
#include "Wire.h"
|
||||
#endif
|
||||
|
||||
const uint8_t MMDVM_FRAME_START = 0xE0U;
|
||||
|
||||
const uint8_t MMDVM_GET_VERSION = 0x00U;
|
||||
|
@ -98,14 +102,14 @@ const uint8_t MMDVM_DEBUG4 = 0xF4U;
|
|||
const uint8_t MMDVM_DEBUG5 = 0xF5U;
|
||||
const uint8_t MMDVM_DEBUG_DUMP = 0xFAU;
|
||||
|
||||
#if defined(DRCC_DVM_NQF)
|
||||
#define HW_TYPE "MMDVM DRCC_DVM_NQF"
|
||||
#elif defined(DRCC_DVM_HHP446)
|
||||
#define HW_TYPE "MMDVM DRCC_DVM_HHP(446)"
|
||||
#elif defined(DRCC_DVM_722)
|
||||
#define HW_TYPE "MMDVM RB_STM32_DVM(722)"
|
||||
#elif defined(DRCC_DVM_446)
|
||||
#define HW_TYPE "MMDVM RB_STM32_DVM(446)"
|
||||
#if defined(ZUM_V09_PI)
|
||||
#define HW_TYPE "MMDVM ZUM_V0.9 (446)"
|
||||
#elif defined(ZUM_V10_PI)
|
||||
#define HW_TYPE "MMDVM ZUM_V1.0 (722)"
|
||||
#elif defined(RB_V3_PI)
|
||||
#define HW_TYPE "MMDVM RB_DVM_V3 (446)"
|
||||
#elif defined(RB_V5_PI)
|
||||
#define HW_TYPE "MMDVM RB_DVM_V5 (722)"
|
||||
#else
|
||||
#define HW_TYPE "MMDVM"
|
||||
#endif
|
||||
|
@ -118,7 +122,7 @@ const char HARDWARE[] = concat(HW_TYPE, VERSION, GITVERSION);
|
|||
const char HARDWARE[] = concat(HW_TYPE, VERSION, __TIME__, __DATE__);
|
||||
#endif
|
||||
|
||||
const uint8_t PROTOCOL_VERSION = 2U;
|
||||
const uint8_t PROTOCOL_VERSION = 2U;
|
||||
|
||||
// Parameters for batching serial data
|
||||
const int MAX_SERIAL_DATA = 250;
|
||||
|
@ -129,6 +133,9 @@ m_host(PIN_HOST_RXD, PIN_HOST_TXD),
|
|||
#if defined(SERIAL_REPEATER)
|
||||
m_rpt(PIN_RPT_RXD, PIN_RPT_TXD),
|
||||
#endif
|
||||
#if defined(I2C_REPEATER)
|
||||
m_i2CData(),
|
||||
#endif
|
||||
m_buffer(),
|
||||
m_ptr(0U),
|
||||
m_len(0U),
|
||||
|
@ -136,9 +143,6 @@ m_debug(false),
|
|||
m_serialData(),
|
||||
m_lastSerialAvail(0),
|
||||
m_lastSerialAvailCount(0U)
|
||||
#if defined(I2C_REPEATER)
|
||||
,m_i2CData()
|
||||
#endif
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -346,11 +350,17 @@ void CSerialPort::getVersion()
|
|||
#endif
|
||||
|
||||
// CPU type/manufacturer. 0=Atmel ARM, 1=NXP ARM, 2=St-Micro ARM
|
||||
reply[6U] = io.getCPU();
|
||||
reply[6U] = 2U;
|
||||
|
||||
// Reserve 16 bytes for the UDID
|
||||
::memset(reply + 7U, 0x00U, 16U);
|
||||
io.getUDID(reply + 7U);
|
||||
#if defined(ZUM_V09_PI) || defined(RB_V3_PI)
|
||||
::memcpy(reply + 7U, (void *)0x1FFF7A10, 12U);
|
||||
#elif defined(ZUM_V10_PI) || defined(RB_V5_PI)
|
||||
::memcpy(reply + 7U, (void *)0x1FF07A10, 12U);
|
||||
#else
|
||||
::memcpy(reply + 7U, "????????????", 12U);
|
||||
#endif
|
||||
|
||||
uint8_t count = 23U;
|
||||
for (uint8_t i = 0U; HARDWARE[i] != 0x00U; i++, count++)
|
||||
|
@ -898,7 +908,8 @@ void CSerialPort::start()
|
|||
#endif
|
||||
#endif
|
||||
#if defined(I2C_REPEATER)
|
||||
m_rpt.begin(9600);
|
||||
Wire.begin();
|
||||
Wire.setClock(100000);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -989,14 +1000,14 @@ void CSerialPort::process()
|
|||
// Write any outgoing serial data
|
||||
uint16_t i2CSpace = m_i2CData.getData();
|
||||
if (i2CSpace > 0U) {
|
||||
int avail = m_rpt.available();
|
||||
int avail = Wire.available();
|
||||
if (avail < i2CSpace)
|
||||
i2CSpace = avail;
|
||||
|
||||
for (uint16_t i = 0U; i < i2CSpace; i++) {
|
||||
uint8_t c = 0U;
|
||||
m_i2CData.get(c);
|
||||
m_rpt.write(&c, 1U);
|
||||
Wire.write(&c, 1U);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
#include "RingBuffer.h"
|
||||
|
||||
#if !defined(SERIAL_SPEED)
|
||||
#define SERIAL_SPEED 115200
|
||||
#define SERIAL_SPEED 460800
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -103,6 +103,9 @@ private:
|
|||
HardwareSerial m_host;
|
||||
#if defined(SERIAL_REPEATER)
|
||||
HardwareSerial m_rpt;
|
||||
#endif
|
||||
#if defined(I2C_REPEATER)
|
||||
CRingBuffer<uint8_t> m_i2CData;
|
||||
#endif
|
||||
uint8_t m_buffer[512U];
|
||||
uint16_t m_ptr;
|
||||
|
@ -111,9 +114,6 @@ private:
|
|||
CRingBuffer<uint8_t> m_serialData;
|
||||
int m_lastSerialAvail;
|
||||
uint16_t m_lastSerialAvailCount;
|
||||
#if defined(I2C_REPEATER)
|
||||
CRingBuffer<uint8_t> m_i2CData;
|
||||
#endif
|
||||
|
||||
void sendACK(uint8_t type);
|
||||
void sendNAK(uint8_t type, uint8_t err);
|
||||
|
|
Loading…
Reference in New Issue