Get it compiling but still incomplete.

2024-10-07 18:18:43 +01:00 · 2024-10-07 18:18:43 +01:00 · 05d1e7e9dc
parent d9932fe5c0
commit 05d1e7e9dc
12 changed files with 73517 additions and 40 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,3 @@
-*.o
-obj/
-bin/
-GitVersion.h
-build/
+.pio
 .vscode

--- a/platformio.ini
+++ b/platformio.ini
@ -15,13 +15,10 @@ framework = arduino
 build_unflags = -O0
 monitor_speed = 460800
 build_flags = 
-    -O3
-    -Wall
-    ; -D ARM_MATH_CM7
-    ; -mfpu=fpv5-sp-d16
-    -D ARM_MATH_CM4
-    -mfpu=fpv4-sp-d16
-    -mfloat-abi=hard
-    ; increase the buffer sizes
-    -D SERIAL_RX_BUFFER_SIZE=2048
-    -D SERIAL_TX_BUFFER_SIZE=2048
+	-O3
+	-Wall
+	-D ARM_MATH_CM4
+	-mfpu=fpv4-sp-d16
+	-mfloat-abi=hard
+	-D SERIAL_RX_BUFFER_SIZE=2048
+	-D SERIAL_TX_BUFFER_SIZE=2048
--- a/src/CMSIS/arm_biquad_cascade_df1_q31.c
+++ b/src/CMSIS/arm_biquad_cascade_df1_q31.c
@ -0,0 +1,509 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_biquad_cascade_df1_q31.c
+ * Description:  Processing function for the Q31 Biquad cascade filter
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @addtogroup BiquadCascadeDF1
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q31 Biquad cascade filter.
+  @param[in]     S         points to an instance of the Q31 Biquad cascade structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize  number of samples to process
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using an internal 64-bit accumulator.
+                   The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+                   Thus, if the accumulator result overflows it wraps around rather than clip.
+                   In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
+                   After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
+                   1.31 format by discarding the low 32 bits.
+  @remark
+                   Refer to \ref arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+__STATIC_INLINE void arm_biquad_cascade_df1_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    const q31_t    *pIn = pSrc; /*  input pointer initialization  */
+    q31_t          *pOut = pDst;        /*  output pointer initialization */
+    int             shift;
+    uint32_t        stages = S->numStages;      /*  loop counters                 */
+    int             postShift = S->postShift;
+    q31x4_t         b0Coeffs, b1Coeffs, a0Coeffs, a1Coeffs;     /*  Coefficients vector           */
+    q31x4_t         stateVec = { 0 };
+    q31_t          *pState = S->pState; /*  pState pointer initialization */
+    q31x4_t         inVec0;
+    int64_t         acc;
+    const q31_t          *pCoeffs = S->pCoeffs;       /*  coeff pointer initialization  */
+    q31_t           out, out1;
+
+
+    shift = (postShift + 1 + 8);
+
+    do {
+        /*
+         * Reading the coefficients
+         * generates :
+         * Fwd0 { b2  b1  b0  0  }
+         * Fwd1 { 0   b2  b1  b0 }
+         * Bwd0 { 0   0   a2  a1 }
+         * Bwd0 { 0   0   a1  a2 }
+         * (can be moved in init)
+         */
+        b0Coeffs = vdupq_n_s32(0);
+        a0Coeffs = vdupq_n_s32(0);
+
+        b0Coeffs[0] = pCoeffs[2];       // b2
+        b0Coeffs[1] = pCoeffs[1];       // b1
+        b0Coeffs[2] = pCoeffs[0];       // b0
+
+        b1Coeffs = b0Coeffs;
+        uint32_t        zero = 0;
+        b1Coeffs = vshlcq_s32(b1Coeffs, &zero, 32);
+
+        a0Coeffs[2] = pCoeffs[4];
+        a0Coeffs[3] = pCoeffs[3];
+        a1Coeffs = vrev64q_s32(a0Coeffs);
+
+
+        /*
+         * prologue consumes history samples
+         */
+
+        /* 2 first elements are garbage, will be updated with history */
+        inVec0 = vld1q(pIn - 2);
+
+        inVec0[0] = pState[1];
+        inVec0[1] = pState[0];
+
+        stateVec[2] = pState[3];
+        stateVec[3] = pState[2];
+
+        acc = vrmlaldavhq(b0Coeffs, inVec0);
+        acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
+        acc = lsll(acc, shift);
+        out = (q31_t) ((acc >> 32) & 0xffffffff);
+
+        stateVec[2] = out;
+        acc = vrmlaldavhq(b1Coeffs, inVec0);
+        acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
+
+        acc = lsll(acc, shift);
+        out1 = (q31_t) ((acc >> 32) & 0xffffffff);
+
+
+        inVec0 = vld1q(pIn);
+        pIn += 2;
+
+        /*
+         * main loop
+         */
+        uint32_t            sample = (blockSize - 2) >> 2U;
+        /*
+         * First part of the processing with loop unrolling.
+         * Compute 4 outputs at a time.
+         */
+        while (sample > 0U) {
+
+            stateVec[3] = out1;
+
+            *pOut++ = out;
+            *pOut++ = out1;
+
+            /*
+             * in         { x0  x1  x2  x3 }
+             *                    x
+             * b0Coeffs   { b2  b1  b0  0  }
+             */
+            acc = vrmlaldavhq(b0Coeffs, inVec0);
+            /*
+             * out         { 0   0   yn2 yn1 }
+             *                    x
+             * a0Coeffs    { 0   0   a2  a1  }
+             */
+            acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            stateVec[2] = out;
+
+            /*
+             * in         { x0  x1  x2  x3 }
+             *                    x
+             * b0Coeffs   {  0  b2  b1  b0 }
+             */
+            acc = vrmlaldavhq(b1Coeffs, inVec0);
+            /*
+             * out         { 0   0   y0  yn1 }
+             *                    x
+             * a0Coeffs    { 0   0   a1  a2  }
+             */
+            acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out1 = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            stateVec[3] = out1;
+
+            inVec0 = vld1q(pIn);
+            pIn += 2;
+
+            /* unrolled part */
+            *pOut++ = out;
+            *pOut++ = out1;
+
+            acc = vrmlaldavhq(b0Coeffs, inVec0);
+            acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            stateVec[2] = out;
+
+            acc = vrmlaldavhq(b1Coeffs, inVec0);
+            acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out1 = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            inVec0 = vld1q(pIn);
+            pIn += 2;
+
+            sample--;
+        }
+
+        *pOut++ = out;
+        *pOut++ = out1;
+
+        /*
+         * Tail handling
+         */
+        int32_t         loopRemainder = blockSize & 3;
+        if (loopRemainder == 2) {
+            /*
+             * Store the updated state variables back into the pState array
+             */
+            pState[0] = inVec0[1];
+            pState[1] = inVec0[0];
+            pState[3] = out;
+            pState[2] = out1;
+        } else if (loopRemainder == 1) {
+            stateVec[3] = out1;
+
+            acc = vrmlaldavhq(b0Coeffs, inVec0);
+            acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            stateVec[2] = out;
+
+            acc = vrmlaldavhq(b1Coeffs, inVec0);
+            acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out1 = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            stateVec[3] = out1;
+
+            inVec0 = vld1q(pIn);
+            pIn += 2;
+
+            *pOut++ = out;
+            *pOut++ = out1;
+
+            acc = vrmlaldavhq(b0Coeffs, inVec0);
+            acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            *pOut++ = out;
+
+            /*
+             * Store the updated state variables back into the pState array
+             */
+            pState[0] = inVec0[2];
+            pState[1] = inVec0[1];
+            pState[3] = out1;
+            pState[2] = out;
+        } else if (loopRemainder == 0) {
+            stateVec[3] = out1;
+
+            acc = vrmlaldavhq(b0Coeffs, inVec0);
+            acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            stateVec[2] = out;
+
+            acc = vrmlaldavhq(b1Coeffs, inVec0);
+            acc = vrmlaldavhaq(acc, a1Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out1 = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            *pOut++ = out;
+            *pOut++ = out1;
+
+            /*
+             * Store the updated state variables back into the pState array
+             */
+            pState[0] = inVec0[3];
+            pState[1] = inVec0[2];
+            pState[3] = out;
+            pState[2] = out1;
+        } else {
+            stateVec[3] = out1;
+
+            acc = vrmlaldavhq(b0Coeffs, inVec0);
+            acc = vrmlaldavhaq(acc, a0Coeffs, stateVec);
+            acc = lsll(acc, shift);
+            out = (q31_t) ((acc >> 32) & 0xffffffff);
+
+            *pOut++ = out;
+
+            /*
+             * Store the updated state variables back into the pState array
+             */
+            pState[0] = inVec0[2];
+            pState[1] = inVec0[1];
+            pState[3] = out1;
+            pState[2] = out;
+        }
+
+
+        pCoeffs += 5;
+        pState += 4;
+
+        /*  The first stage goes from the input buffer to the output buffer. */
+        /*  Subsequent stages occur in-place in the output buffer */
+        pIn = pDst;
+
+        /* Reset to destination pointer */
+        pOut = pDst;
+    }
+    while (--stages);
+}
+#else
+void arm_biquad_cascade_df1_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  const q31_t *pIn = pSrc;                             /* Source pointer */
+        q31_t *pOut = pDst;                            /* Destination pointer */
+        q31_t *pState = S->pState;                     /* pState pointer */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q63_t acc;                                     /* Accumulator */
+        q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
+        q31_t Xn1, Xn2, Yn1, Yn2;                      /* Filter pState variables */
+        q31_t Xn;                                      /* Temporary input */
+        uint32_t uShift = ((uint32_t) S->postShift + 1U);
+        uint32_t lShift = 32U - uShift;                /* Shift to be applied to the output */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t acc_l, acc_h;                            /* temporary output variables */
+#endif
+
+  do
+  {
+    /* Reading the coefficients */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Reading the pState values */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Apply loop unrolling and compute 4 output values simultaneously. */
+    /* Variable acc hold output values that are being computed:
+     *
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     */
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    sample = blockSize >> 2U;
+
+    while (sample > 0U)
+    {
+      /* Read the first input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
+
+      /* The result is converted to 1.31 , Yn2 variable is reused */
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
+
+      /* Apply shift for lower part of acc and upper part of acc */
+      Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+      /* Store output in destination buffer. */
+      *pOut++ = Yn2;
+
+      /* Read the second input */
+      Xn2 = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = ((q63_t) b0 * Xn2) + ((q63_t) b1 * Xn) + ((q63_t) b2 * Xn1) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
+
+      /* The result is converted to 1.31, Yn1 variable is reused  */
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
+
+      /* Apply shift for lower part of acc and upper part of acc */
+      Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+      /* Store output in destination buffer. */
+      *pOut++ = Yn1;
+
+      /* Read the third input */
+      Xn1 = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = ((q63_t) b0 * Xn1) + ((q63_t) b1 * Xn2) + ((q63_t) b2 * Xn) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
+
+      /* The result is converted to 1.31, Yn2 variable is reused  */
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
+
+      /* Apply shift for lower part of acc and upper part of acc */
+      Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+      /* Store output in destination buffer. */
+      *pOut++ = Yn2;
+
+      /* Read the forth input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
+
+      /* The result is converted to 1.31, Yn1 variable is reused  */
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
+
+      /* Apply shift for lower part of acc and upper part of acc */
+      Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+      /* Store output in destination buffer. */
+      *pOut++ = Yn1;
+
+      /* Every time after the output is computed state should be updated. */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+
+      /* decrement loop counter */
+      sample--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    sample = blockSize & 0x3U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
+
+      /* The result is converted to 1.31  */
+      acc = acc >> lShift;
+
+      /* Store output in destination buffer. */
+      *pOut++ = (q31_t) acc;
+
+      /* Every time after the output is computed state should be updated. */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+      Yn1 = (q31_t) acc;
+
+      /* decrement loop counter */
+      sample--;
+    }
+
+    /* Store the updated state variables back into the pState array */
+    *pState++ = Xn1;
+    *pState++ = Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+    /* The first stage goes from the input buffer to the output buffer. */
+    /* Subsequent numStages occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset output pointer */
+    pOut = pDst;
+
+    /* decrement loop counter */
+    stage--;
+
+  } while (stage > 0U);
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BiquadCascadeDF1 group
+ */
--- a/src/CMSIS/arm_common_tables.c
+++ b/src/CMSIS/arm_common_tables.c
--- a/src/CMSIS/arm_fir_f32.c
+++ b/src/CMSIS/arm_fir_f32.c
--- a/src/CMSIS/arm_fir_fast_q15.c
+++ b/src/CMSIS/arm_fir_fast_q15.c
@ -0,0 +1,332 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fir_fast_q15.c
+ * Description:  Q15 Fast FIR filter processing function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @addtogroup FIR
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q15 FIR filter (fast version).
+  @param[in]     S          points to an instance of the Q15 FIR filter structure
+  @param[in]     pSrc       points to the block of input data
+  @param[out]    pDst       points to the block of output data
+  @param[in]     blockSize  number of samples to process
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   This fast version uses a 32-bit accumulator with 2.30 format.
+                   The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+                   Thus, if the accumulator result overflows it wraps around and distorts the result.
+                   In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+                   The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
+
+  @remark
+                   Refer to \ref arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.  Both the slow and the fast versions use the same instance structure.
+                   Use function \ref arm_fir_init_q15() to initialize the filter structure.
+ */
+
+void arm_fir_fast_q15(
+  const arm_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q15_t *pStateCurnt;                            /* Points to the current sample of the state */
+        q15_t *px;                                     /* Temporary pointer for state buffer */
+  const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
+        q31_t acc0;                                    /* Accumulators */
+        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+        uint32_t tapCnt, blkCnt;                       /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t acc1, acc2, acc3;                        /* Accumulators */
+        q31_t x0, x1, x2, c0;                          /* Temporary variables to hold state and coefficient values */
+#endif
+
+  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 output values simultaneously.
+   * The variables acc0 ... acc3 hold output values that are being computed:
+   *
+   *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+   *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+   *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+   *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
+   */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy 4 new input samples into the state buffer. */
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+
+    /* Set all accumulators to zero */
+    acc0 = 0;
+    acc1 = 0;
+    acc2 = 0;
+    acc3 = 0;
+
+    /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
+    px = pState;
+
+    /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
+    pb = pCoeffs;
+
+    /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
+    x0 = read_q15x2_ia (&px);
+
+    /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
+    x2 = read_q15x2_ia (&px);
+
+    /* Loop over the number of taps.  Unroll by a factor of 4.
+       Repeat until we've computed numTaps-(numTaps%4) coefficients. */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
+      c0 = read_q15x2_ia ((q15_t **) &pb);
+
+      /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+      acc0 = __SMLAD(x0, c0, acc0);
+
+      /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
+      acc2 = __SMLAD(x2, c0, acc2);
+
+      /* pack  x[n-N-1] and x[n-N-2] */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x1 = __PKHBT(x2, x0, 0);
+#else
+      x1 = __PKHBT(x0, x2, 0);
+#endif
+
+      /* Read state x[n-N-4], x[n-N-5] */
+      x0 = read_q15x2_ia (&px);
+
+      /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
+      acc1 = __SMLADX(x1, c0, acc1);
+
+      /* pack  x[n-N-3] and x[n-N-4] */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x1 = __PKHBT(x0, x2, 0);
+#else
+      x1 = __PKHBT(x2, x0, 0);
+#endif
+
+      /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
+      acc3 = __SMLADX(x1, c0, acc3);
+
+      /* Read coefficients b[N-2], b[N-3] */
+      c0 = read_q15x2_ia ((q15_t **) &pb);
+
+      /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
+      acc0 = __SMLAD(x2, c0, acc0);
+
+      /* Read state x[n-N-6], x[n-N-7] with offset */
+      x2 = read_q15x2_ia (&px);
+
+      /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
+      acc2 = __SMLAD(x0, c0, acc2);
+
+      /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
+      acc1 = __SMLADX(x1, c0, acc1);
+
+      /* pack  x[n-N-5] and x[n-N-6] */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x1 = __PKHBT(x2, x0, 0);
+#else
+      x1 = __PKHBT(x0, x2, 0);
+#endif
+
+      /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
+      acc3 = __SMLADX(x1, c0, acc3);
+
+      /* Decrement tap count */
+      tapCnt--;
+    }
+
+    /* If the filter length is not a multiple of 4, compute the remaining filter taps.
+       This is always be 2 taps since the filter length is even. */
+    if ((numTaps & 0x3U) != 0U)
+    {
+      /* Read last two coefficients */
+      c0 = read_q15x2_ia ((q15_t **) &pb);
+
+      /* Perform the multiply-accumulates */
+      acc0 = __SMLAD(x0, c0, acc0);
+      acc2 = __SMLAD(x2, c0, acc2);
+
+      /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x1 = __PKHBT(x2, x0, 0);
+#else
+      x1 = __PKHBT(x0, x2, 0);
+#endif
+
+      /* Read last state variables */
+      x0 = read_q15x2 (px);
+
+      /* Perform the multiply-accumulates */
+      acc1 = __SMLADX(x1, c0, acc1);
+
+      /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x1 = __PKHBT(x0, x2, 0);
+#else
+      x1 = __PKHBT(x2, x0, 0);
+#endif
+
+      /* Perform the multiply-accumulates */
+      acc3 = __SMLADX(x1, c0, acc3);
+    }
+
+    /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
+       Then store the 4 outputs in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+    write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+    write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+    write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+    write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* Advance the state pointer by 4 to process the next group of 4 samples */
+    pState = pState + 4U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining output samples */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of taps */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Copy two samples into state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Set the accumulator to zero */
+    acc0 = 0;
+
+    /* Use SIMD to hold states and coefficients */
+    px = pState;
+    pb = pCoeffs;
+
+    tapCnt = numTaps >> 1U;
+
+    do
+    {
+      acc0 += (q31_t) *px++ * *pb++;
+	  acc0 += (q31_t) *px++ * *pb++;
+
+      tapCnt--;
+    }
+    while (tapCnt > 0U);
+
+    /* The result is in 2.30 format. Convert to 1.15 with saturation.
+       Then store the output in the destination buffer. */
+    *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+    /* Advance state pointer by 1 for the next sample */
+    pState = pState + 1U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 taps at a time */
+  tapCnt = (numTaps - 1U) >> 2U;
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Calculate remaining number of copies */
+  tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+  /* Initialize tapCnt with number of taps */
+  tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Copy remaining data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+}
+
+/**
+  @} end of FIR group
+ */
--- a/src/CMSIS/arm_fir_interpolate_q15.c
+++ b/src/CMSIS/arm_fir_interpolate_q15.c
@ -0,0 +1,774 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fir_interpolate_q15.c
+ * Description:  Q15 FIR interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @addtogroup FIR_Interpolate
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q15 FIR interpolator.
+  @param[in]     S          points to an instance of the Q15 FIR interpolator structure
+  @param[in]     pSrc       points to the block of input data
+  @param[out]    pDst       points to the block of output data
+  @param[in]     blockSize  number of samples to process
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using a 64-bit internal accumulator.
+                   Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+                   The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+                   There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+                   After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+                   Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+__STATIC_INLINE void arm_fir_interpolate_q15(
+  const arm_fir_interpolate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+    q15_t    *pState = S->pState;   /* State pointer */
+    const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+    q15_t    *pStateCurnt;      /* Points to the current sample of the state */
+    const q15_t    *ptr1, *ptr2;      /* Temporary pointers for state and coefficient buffers */
+
+    uint32_t  i, blkCnt;        /* Loop counters */
+    uint16_t  phaseLen = S->phaseLength;    /* Length of each polyphase filter component */
+    uint16_t  strides[8] = {
+        0, 1 * S->L, 2 * S->L, 3 * S->L,
+        4 * S->L, 5 * S->L, 6 * S->L, 7 * S->L
+        };
+    uint16x8_t vec_strides0 = *(uint16x8_t *) strides;
+    uint16x8_t vec_strides1 = vec_strides0 + 1;
+    uint16x8_t vec_strides2 = vec_strides0 + 2;
+    uint16x8_t vec_strides3 = vec_strides0 + 3;
+    q15x8_t vecState, vecCoef;
+
+    /*
+     * S->pState buffer contains previous frame (phaseLen - 1) samples
+     * pStateCurnt points to the location where the new input data should be written
+     */
+    pStateCurnt = S->pState + ((q15_t) phaseLen - 1);
+    /*
+     * Total number of intput samples
+     */
+    blkCnt = blockSize;
+    /*
+     * Loop over the blockSize.
+     */
+    while (blkCnt > 0U)
+    {
+        /*
+         * Copy new input sample into the state buffer
+         */
+        *pStateCurnt++ = *pSrc++;
+        /*
+         * Loop over the Interpolation factor.
+         */
+        i = S->L;
+        while (i > 0U)
+        {
+            /*
+             * Initialize state pointer
+             */
+            ptr1 = pState;
+            if (i >= 4)
+            {
+                /*
+                 * Initialize coefficient pointer
+                 */
+                ptr2 = pCoeffs + (i - 1 - 3U);
+
+                q63_t     acc0 = 0LL;
+                q63_t     acc1 = 0LL;
+                q63_t     acc2 = 0LL;
+                q63_t     acc3 = 0LL;
+
+                uint32_t  tapCnt = phaseLen >> 3;
+                while (tapCnt > 0U)
+                {
+                    vecState = vldrhq_s16(ptr1);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides3);
+                    acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides2);
+                    acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
+                    acc2 = vmlaldavaq(acc2, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
+                    acc3 = vmlaldavaq(acc3, vecState, vecCoef);
+
+                    ptr1 += 8;
+                    ptr2 = ptr2 + S->L * 8;
+                    tapCnt--;
+                }
+                tapCnt = phaseLen & 7;
+                if (tapCnt > 0U)
+                {
+                    mve_pred16_t p0 = vctp16q(tapCnt);
+
+                    vecState = vldrhq_z_s16(ptr1, p0);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides3, p0);
+                    acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides2, p0);
+                    acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
+                    acc2 = vmlaldavaq(acc2, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
+                    acc3 = vmlaldavaq(acc3, vecState, vecCoef);
+                }
+
+                acc0 = asrl(acc0, 15);
+                acc1 = asrl(acc1, 15);
+                acc2 = asrl(acc2, 15);
+                acc3 = asrl(acc3, 15);
+
+                *pDst++ = (q15_t) __SSAT(acc0, 16);
+                *pDst++ = (q15_t) __SSAT(acc1, 16);
+                *pDst++ = (q15_t) __SSAT(acc2, 16);
+                *pDst++ = (q15_t) __SSAT(acc3, 16);
+                i -= 4;
+            }
+            else if (i >= 3)
+            {
+                /*
+                 * Initialize coefficient pointer
+                 */
+                ptr2 = pCoeffs + (i - 1U - 2);
+
+                q63_t     acc0 = 0LL;
+                q63_t     acc1 = 0LL;
+                q63_t     acc2 = 0LL;
+
+                uint32_t  tapCnt = phaseLen >> 3;
+                while (tapCnt > 0U)
+                {
+                    vecState = vldrhq_s16(ptr1);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides2);
+                    acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
+                    acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
+                    acc2 = vmlaldavaq(acc2, vecState, vecCoef);
+
+                    ptr1 += 8;
+                    ptr2 = ptr2 + S->L * 8;
+                    tapCnt--;
+                }
+                tapCnt = phaseLen & 7;
+                if (tapCnt > 0U)
+                {
+                    mve_pred16_t p0 = vctp16q(tapCnt);
+
+                    vecState = vldrhq_z_s16(ptr1, p0);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides2, p0);
+                    acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
+                    acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
+                    acc2 = vmlaldavaq(acc2, vecState, vecCoef);
+                }
+
+                acc0 = asrl(acc0, 15);
+                acc1 = asrl(acc1, 15);
+                acc2 = asrl(acc2, 15);
+
+                *pDst++ = (q15_t) __SSAT(acc0, 16);;
+                *pDst++ = (q15_t) __SSAT(acc1, 16);;
+                *pDst++ = (q15_t) __SSAT(acc2, 16);;
+                i -= 3;
+            }
+            else if (i >= 2)
+            {
+                /*
+                 * Initialize coefficient pointer
+                 */
+                ptr2 = pCoeffs + (i - 1U - 1);
+
+                q63_t     acc0 = 0LL;
+                q63_t     acc1 = 0LL;
+
+                uint32_t  tapCnt = phaseLen >> 3;
+                while (tapCnt > 0U)
+                {
+                    vecState = vldrhq_s16(ptr1);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
+                    acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
+                    acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+                    ptr1 += 8;
+                    ptr2 = ptr2 + S->L * 8;
+                    tapCnt--;
+                }
+                tapCnt = phaseLen & 7;
+                if (tapCnt > 0U)
+                {
+                    mve_pred16_t p0 = vctp16q(tapCnt);
+
+                    vecState = vldrhq_z_s16(ptr1, p0);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
+                    acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
+                    acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+                }
+
+                acc0 = asrl(acc0, 15);
+                acc1 = asrl(acc1, 15);
+
+                *pDst++ = (q15_t) __SSAT(acc0, 16);
+                *pDst++ = (q15_t) __SSAT(acc1, 16);
+                i -= 2;
+            }
+            else
+            {
+                /*
+                 * Initialize coefficient pointer
+                 */
+                ptr2 = pCoeffs + (i - 1U);
+
+                q63_t     acc0 = 0LL;
+
+                uint32_t  tapCnt = phaseLen >> 3;
+                while (tapCnt > 0U)
+                {
+                    vecState = vldrhq_s16(ptr1);
+                    vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
+
+                    acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+                    ptr1 += 8;
+                    ptr2 = ptr2 + S->L * 8;
+                    tapCnt--;
+                }
+                tapCnt = phaseLen & 7;
+                if (tapCnt > 0U)
+                {
+                    mve_pred16_t p0 = vctp16q(tapCnt);
+
+                    vecState = vldrhq_z_s16(ptr1, p0);
+                    vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
+                    acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+                }
+
+                acc0 = asrl(acc0, 15);
+                *pDst++ = (q15_t) __SSAT(acc0, 16);
+                /*
+                 * Decrement the loop counter
+                 */
+                i--;
+            }
+        }
+        /*
+         * Advance the state pointer by 1
+         * * to process the next group of interpolation factor number samples
+         */
+        pState = pState + 1;
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+    }
+
+    /*
+     * Processing is complete.
+     * Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+     * This prepares the state buffer for the next function call.
+     */
+
+    /*
+     * Points to the start of the state buffer
+     */
+    pStateCurnt = S->pState;
+    blkCnt = (phaseLen - 1U) >> 3;
+    while (blkCnt > 0U)
+    {
+        vstrhq_s16(pStateCurnt, vldrhq_s16(pState));
+        pState += 8;
+        pStateCurnt += 8;
+        blkCnt--;
+    }
+    blkCnt = (phaseLen - 1U) & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vstrhq_p_s16(pStateCurnt, vldrhq_s16(pState), p0);
+    }
+}
+#else
+void arm_fir_interpolate_q15(
+  const arm_fir_interpolate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+        q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q15_t *pStateCur;                              /* Points to the current sample of the state */
+        q15_t *ptr1;                                   /* Temporary pointer for state buffer */
+  const q15_t *ptr2;                                   /* Temporary pointer for coefficient buffer */
+        q63_t sum0;                                    /* Accumulators */
+        uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
+        uint32_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
+        uint32_t j;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q63_t acc0, acc1, acc2, acc3;
+        q15_t x0, x1, x2, x3;
+        q15_t c0, c1, c2, c3;
+#endif
+
+  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+  /* pStateCur points to the location where the new input data should be written */
+  pStateCur = S->pState + (phaseLen - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCur++ = *pSrc++;
+    *pStateCur++ = *pSrc++;
+    *pStateCur++ = *pSrc++;
+    *pStateCur++ = *pSrc++;
+
+    /* Address modifier index of coefficient buffer */
+    j = 1U;
+
+    /* Loop over the Interpolation factor. */
+    i = (S->L);
+
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      acc0 = 0;
+      acc1 = 0;
+      acc2 = 0;
+      acc3 = 0;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + (S->L - j);
+
+      /* Loop over the polyPhase length. Unroll by a factor of 4.
+         Repeat until we've computed numTaps-(4*S->L) coefficients. */
+      tapCnt = phaseLen >> 2U;
+
+      x0 = *(ptr1++);
+      x1 = *(ptr1++);
+      x2 = *(ptr1++);
+
+      while (tapCnt > 0U)
+      {
+        /* Read the input sample */
+        x3 = *(ptr1++);
+
+        /* Read the coefficient */
+        c0 = *(ptr2);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x0 * c0;
+        acc1 += (q63_t) x1 * c0;
+        acc2 += (q63_t) x2 * c0;
+        acc3 += (q63_t) x3 * c0;
+
+        /* Read the coefficient */
+        c1 = *(ptr2 + S->L);
+
+        /* Read the input sample */
+        x0 = *(ptr1++);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x1 * c1;
+        acc1 += (q63_t) x2 * c1;
+        acc2 += (q63_t) x3 * c1;
+        acc3 += (q63_t) x0 * c1;
+
+        /* Read the coefficient */
+        c2 = *(ptr2 + S->L * 2);
+
+        /* Read the input sample */
+        x1 = *(ptr1++);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x2 * c2;
+        acc1 += (q63_t) x3 * c2;
+        acc2 += (q63_t) x0 * c2;
+        acc3 += (q63_t) x1 * c2;
+
+        /* Read the coefficient */
+        c3 = *(ptr2 + S->L * 3);
+
+        /* Read the input sample */
+        x2 = *(ptr1++);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x3 * c3;
+        acc1 += (q63_t) x0 * c3;
+        acc2 += (q63_t) x1 * c3;
+        acc3 += (q63_t) x2 * c3;
+
+
+        /* Upsampling is done by stuffing L-1 zeros between each sample.
+         * So instead of multiplying zeros with coefficients,
+         * Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += 4 * S->L;
+
+        /* Decrement loop counter */
+        tapCnt--;
+      }
+
+      /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+      tapCnt = phaseLen % 0x4U;
+
+      while (tapCnt > 0U)
+      {
+        /* Read the input sample */
+        x3 = *(ptr1++);
+
+        /* Read the coefficient */
+        c0 = *(ptr2);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x0 * c0;
+        acc1 += (q63_t) x1 * c0;
+        acc2 += (q63_t) x2 * c0;
+        acc3 += (q63_t) x3 * c0;
+
+        /* Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* update states for next sample processing */
+        x0 = x1;
+        x1 = x2;
+        x2 = x3;
+
+        /* Decrement loop counter */
+        tapCnt--;
+      }
+
+      /* The result is in the accumulator, store in the destination buffer. */
+      *(pDst           ) = (q15_t) (__SSAT((acc0 >> 15), 16));
+      *(pDst +     S->L) = (q15_t) (__SSAT((acc1 >> 15), 16));
+      *(pDst + 2 * S->L) = (q15_t) (__SSAT((acc2 >> 15), 16));
+      *(pDst + 3 * S->L) = (q15_t) (__SSAT((acc3 >> 15), 16));
+
+      pDst++;
+
+      /* Increment the address modifier index of coefficient buffer */
+      j++;
+
+      /* Decrement loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 4;
+
+    pDst += S->L * 3;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCur++ = *pSrc++;
+
+    /* Address modifier index of coefficient buffer */
+    j = 1U;
+
+    /* Loop over the Interpolation factor. */
+    i = S->L;
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      sum0 = 0;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + (S->L - j);
+
+      /* Loop over the polyPhase length.
+         Repeat until we've computed numTaps-(4*S->L) coefficients. */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+     /* Loop unrolling: Compute 4 outputs at a time */
+      tapCnt = phaseLen >> 2U;
+
+      while (tapCnt > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+
+        /* Upsampling is done by stuffing L-1 zeros between each sample.
+         * So instead of multiplying zeros with coefficients,
+         * Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+        ptr2 += S->L;
+
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+        ptr2 += S->L;
+
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+        ptr2 += S->L;
+
+        /* Decrement loop counter */
+        tapCnt--;
+      }
+
+      /* Loop unrolling: Compute remaining outputs */
+      tapCnt = phaseLen % 0x4U;
+
+#else
+
+      /* Initialize tapCnt with number of samples */
+      tapCnt = phaseLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+      while (tapCnt > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+
+        /* Upsampling is done by stuffing L-1 zeros between each sample.
+         * So instead of multiplying zeros with coefficients,
+         * Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* Decrement loop counter */
+        tapCnt--;
+      }
+
+      /* The result is in the accumulator, store in the destination buffer. */
+      *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+      /* Increment the address modifier index of coefficient buffer */
+      j++;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 1;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCur = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  tapCnt = (phaseLen - 1U) >> 2U;
+
+  /* copy data */
+  while (tapCnt > 0U)
+  {
+    write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+    write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  tapCnt = (phaseLen - 1U) % 0x04U;
+
+#else
+
+    /* Initialize tapCnt with number of samples */
+    tapCnt = (phaseLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+        q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q15_t *pStateCur;                              /* Points to the current sample of the state */
+        q15_t *ptr1;                                   /* Temporary pointer for state buffer */
+  const q15_t *ptr2;                                   /* Temporary pointer for coefficient buffer */
+        q63_t sum0;                                    /* Accumulators */
+        uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
+        uint32_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
+
+  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+  /* pStateCur points to the location where the new input data should be written */
+  pStateCur = S->pState + (phaseLen - 1U);
+
+  /* Total number of intput samples */
+  blkCnt = blockSize;
+
+  /* Loop over the blockSize. */
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCur++ = *pSrc++;
+
+    /* Loop over the Interpolation factor. */
+    i = S->L;
+
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      sum0 = 0;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + (i - 1U);
+
+      /* Loop over the polyPhase length */
+      tapCnt = phaseLen;
+
+      while (tapCnt > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum0 += ((q63_t) *ptr1++ * *ptr2);
+
+        /* Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+      }
+
+      /* Store the result after converting to 1.15 format in the destination buffer. */
+      *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+      /* Decrement loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 1;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+   ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCur = S->pState;
+
+  tapCnt = phaseLen - 1U;
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEI)*/
+/**
+  @} end of FIR_Interpolate group
+ */
--- a/src/CMSIS/arm_q15_to_q31.c
+++ b/src/CMSIS/arm_q15_to_q31.c
@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q15_to_q31.c
+ * Description:  Converts the elements of the Q15 vector to Q31 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup q15_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q15 vector to Q31 vector.
+  @param[in]     pSrc       points to the Q15 input vector
+  @param[out]    pDst       points to the Q31 output vector
+  @param[in]     blockSize  number of samples in each vector
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q31_t) pSrc[n] << 16;   0 <= n < blockSize.
+  </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+__STATIC_INLINE void arm_q15_to_q31(
+  const q15_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+
+  uint32_t blkCnt;
+
+  q31x4_t         vecDst;
+
+  blkCnt = blockSize>> 2;
+  while (blkCnt > 0U)
+  {
+
+        /* C = (q31_t)A << 16 */
+        /* convert from q15 to q31 and then store the results in the destination buffer */
+        /* load q15 + 32-bit widening */
+        vecDst = vldrhq_s32((q15_t const *) pSrc);
+        vecDst = vshlq_n(vecDst, 16);
+        vstrwq_s32(pDst, vecDst);
+
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pDst += 4;
+        pSrc += 4;
+        blkCnt --;
+    }
+
+  blkCnt = blockSize & 3;
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t) A << 16 */
+
+    /* Convert from q15 to q31 and store result in destination buffer */
+    *pDst++ = (q31_t) *pSrc++ << 16;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_q15_to_q31(
+  const q15_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q15_t *pIn = pSrc;                             /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t in1, in2;
+        q31_t out1, out2, out3, out4;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t)A << 16 */
+
+    /* Convert from q15 to q31 and store result in destination buffer */
+    in1 = read_q15x2_ia (&pIn);
+    in2 = read_q15x2_ia (&pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+    /* extract lower 16 bits to 32 bit result */
+    out1 = in1 << 16U;
+    /* extract upper 16 bits to 32 bit result */
+    out2 = in1 & 0xFFFF0000;
+    /* extract lower 16 bits to 32 bit result */
+    out3 = in2 << 16U;
+    /* extract upper 16 bits to 32 bit result */
+    out4 = in2 & 0xFFFF0000;
+
+#else
+
+    /* extract upper 16 bits to 32 bit result */
+    out1 = in1 & 0xFFFF0000;
+    /* extract lower 16 bits to 32 bit result */
+    out2 = in1 << 16U;
+    /* extract upper 16 bits to 32 bit result */
+    out3 = in2 & 0xFFFF0000;
+    /* extract lower 16 bits to 32 bit result */
+    out4 = in2 << 16U;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    *pDst++ = out1;
+    *pDst++ = out2;
+    *pDst++ = out3;
+    *pDst++ = out4;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t) A << 16 */
+
+    /* Convert from q15 to q31 and store result in destination buffer */
+    *pDst++ = (q31_t) *pIn++ << 16;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of q15_to_x group
+ */
--- a/src/CMSIS/arm_sin_q31.c
+++ b/src/CMSIS/arm_sin_q31.c
@ -0,0 +1,82 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sin_q31.c
+ * Description:  Fast sine calculation for Q31 values
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/fast_math_functions.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup groupFastMath
+ */
+
+/**
+  @addtogroup sin
+  @{
+ */
+
+/**
+  @brief         Fast approximation to the trigonometric sine function for Q31 data.
+  @param[in]     x  Scaled input value in radians
+  @return        sin(x)
+
+  The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*PI).
+ */
+
+q31_t arm_sin_q31(
+  q31_t x)
+{
+  q31_t sinVal;                                  /* Temporary variables for input, output */
+  int32_t index;                                 /* Index variable */
+  q31_t a, b;                                    /* Two nearest output values */
+  q31_t fract;                                   /* Temporary values for fractional values */
+
+  if (x < 0)
+  { /* convert negative numbers to corresponding positive ones */
+    x = (uint32_t)x + 0x80000000;
+  }
+
+  /* Calculate the nearest index */
+  index = (uint32_t)x >> FAST_MATH_Q31_SHIFT;
+
+  /* Calculation of fractional value */
+  fract = (x - (index << FAST_MATH_Q31_SHIFT)) << 9;
+
+  /* Read two nearest values of input value from the sin table */
+  a = sinTable_q31[index];
+  b = sinTable_q31[index+1];
+
+  /* Linear interpolation process */
+  sinVal = (q63_t) (0x80000000 - fract) * a >> 32;
+  sinVal = (q31_t) ((((q63_t) sinVal << 32) + ((q63_t) fract * b)) >> 32);
+
+  /* Return output value */
+  return (sinVal << 1);
+}
+
+/**
+  @} end of sin group
+ */
--- a/src/IO.h
+++ b/src/IO.h
@ -58,10 +58,6 @@ public:
  void resetWatchdog();
  uint32_t getWatchdog();
  
-  uint8_t getCPU() const;
-
-  void getUDID(uint8_t* buffer);
-
  void selfTest();

 private:
--- a/src/SerialPort.cpp
+++ b/src/SerialPort.cpp
@ -28,6 +28,10 @@
 #include "Version.h"
 #include "IOPins.h"

+#if defined(I2C_REPEATER)
+#include "Wire.h"
+#endif
+
 const uint8_t MMDVM_FRAME_START  = 0xE0U;

 const uint8_t MMDVM_GET_VERSION  = 0x00U;
@ -98,14 +102,14 @@ const uint8_t MMDVM_DEBUG4       = 0xF4U;
 const uint8_t MMDVM_DEBUG5       = 0xF5U;
 const uint8_t MMDVM_DEBUG_DUMP   = 0xFAU;

-#if defined(DRCC_DVM_NQF)
-#define	HW_TYPE	"MMDVM DRCC_DVM_NQF"
-#elif defined(DRCC_DVM_HHP446)
-#define	HW_TYPE	"MMDVM DRCC_DVM_HHP(446)"
-#elif defined(DRCC_DVM_722)
-#define HW_TYPE "MMDVM RB_STM32_DVM(722)"
-#elif defined(DRCC_DVM_446)
-#define HW_TYPE "MMDVM RB_STM32_DVM(446)"
+#if defined(ZUM_V09_PI)
+#define	HW_TYPE	"MMDVM ZUM_V0.9 (446)"
+#elif defined(ZUM_V10_PI)
+#define	HW_TYPE	"MMDVM ZUM_V1.0 (722)"
+#elif defined(RB_V3_PI)
+#define HW_TYPE "MMDVM RB_DVM_V3 (446)"
+#elif defined(RB_V5_PI)
+#define HW_TYPE "MMDVM RB_DVM_V5 (722)"
 #else
 #define	HW_TYPE	"MMDVM"
 #endif
@ -118,7 +122,7 @@ const char HARDWARE[] = concat(HW_TYPE, VERSION, GITVERSION);
 const char HARDWARE[] = concat(HW_TYPE, VERSION, __TIME__, __DATE__);
 #endif

-const uint8_t PROTOCOL_VERSION   = 2U;
+const uint8_t PROTOCOL_VERSION  = 2U;

 // Parameters for batching serial data
 const int      MAX_SERIAL_DATA  = 250;
@ -129,6 +133,9 @@ m_host(PIN_HOST_RXD, PIN_HOST_TXD),
 #if defined(SERIAL_REPEATER)
 m_rpt(PIN_RPT_RXD, PIN_RPT_TXD),
 #endif
+#if defined(I2C_REPEATER)
+m_i2CData(),
+#endif
 m_buffer(),
 m_ptr(0U),
 m_len(0U),
@ -136,9 +143,6 @@ m_debug(false),
 m_serialData(),
 m_lastSerialAvail(0),
 m_lastSerialAvailCount(0U)
-#if defined(I2C_REPEATER)
-,m_i2CData()
-#endif
 {
 }

@ -346,11 +350,17 @@ void CSerialPort::getVersion()
 #endif

  // CPU type/manufacturer. 0=Atmel ARM, 1=NXP ARM, 2=St-Micro ARM
-  reply[6U] = io.getCPU();
+  reply[6U] = 2U;

  // Reserve 16 bytes for the UDID
  ::memset(reply + 7U, 0x00U, 16U);
-  io.getUDID(reply + 7U);
+#if defined(ZUM_V09_PI) || defined(RB_V3_PI)
+  ::memcpy(reply + 7U, (void *)0x1FFF7A10, 12U);
+#elif defined(ZUM_V10_PI) || defined(RB_V5_PI)
+  ::memcpy(reply + 7U, (void *)0x1FF07A10, 12U);
+#else
+  ::memcpy(reply + 7U, "????????????", 12U);
+#endif

  uint8_t count = 23U;
  for (uint8_t i = 0U; HARDWARE[i] != 0x00U; i++, count++)
@ -898,7 +908,8 @@ void CSerialPort::start()
  #endif
 #endif
 #if defined(I2C_REPEATER)
-  m_rpt.begin(9600);
+  Wire.begin();
+  Wire.setClock(100000);
 #endif
 }

@ -989,14 +1000,14 @@ void CSerialPort::process()
  // Write any outgoing serial data
  uint16_t i2CSpace = m_i2CData.getData();
  if (i2CSpace > 0U) {
-    int avail = m_rpt.available();
+    int avail = Wire.available();
    if (avail < i2CSpace)
      i2CSpace = avail;

    for (uint16_t i = 0U; i < i2CSpace; i++) {
      uint8_t c = 0U;
      m_i2CData.get(c);
-      m_rpt.write(&c, 1U);
+      Wire.write(&c, 1U);
    }
  }
 #endif
--- a/src/SerialPort.h
+++ b/src/SerialPort.h
@ -24,7 +24,7 @@
 #include "RingBuffer.h"

 #if !defined(SERIAL_SPEED)
-#define SERIAL_SPEED 115200
+#define SERIAL_SPEED 460800
 #endif


@ -103,6 +103,9 @@ private:
  HardwareSerial m_host;
 #if defined(SERIAL_REPEATER)
  HardwareSerial m_rpt;
+#endif
+#if defined(I2C_REPEATER)
+  CRingBuffer<uint8_t> m_i2CData;
 #endif
  uint8_t        m_buffer[512U];
  uint16_t       m_ptr;
@ -111,9 +114,6 @@ private:
  CRingBuffer<uint8_t> m_serialData;
  int            m_lastSerialAvail;
  uint16_t       m_lastSerialAvailCount;
-#if defined(I2C_REPEATER)
-  CRingBuffer<uint8_t> m_i2CData;
-#endif

  void    sendACK(uint8_t type);
  void    sendNAK(uint8_t type, uint8_t err);