CMSIS-DSP FFT Benchmark

文章详细分析了STM32G431RB微控制器上使用IAR编译器进行定点和浮点快速傅立叶变换(RFFT)的性能,包括时间消耗、结果表示以及数据定标问题,还提供了测试代码示例。
摘要由CSDN通过智能技术生成

Test Condition

MCU: STM32G431RB @170MHz
IDE: IAR V9.40
Optimization: -o3

Benchmark

Time consuming

| | Float32 | Q31 | Q15 |
|---|---|---|---|
| Duration Without Calc Mag (us) | 4912 | 1659 | 1001 |
| Duration Calc Mag (us) | 5948 | 1943 | 1243 |

Result representation

| | Float32 | Q31 | Q15 |
|---|---|---|---|
| Input DC | 64 | 1024 | 128 |
| Output DC | ~128 | ~1024 | ~128 |
| Input AC | 1024 | 1024 | 1024 |
| Output AC | ~1024 | ~512 | ~512 |

Key points

  • 定点RFFT的输出Buffer长度必须是输入的2倍 (浮点没有这个要求),这个没有在源代码里说明,只在官方网页上有一行描述:

Official page
If the input buffer is of length N (fftLenReal), the output buffer must have length 2N since it is containing the conjugate part (except for MVE version where N+2 is enough). The input buffer is modified by this function.
For the RIFFT, the source buffer must have length N+2 since the Nyquist frequency value is needed but conjugate part is ignored. It is not using the packing trick of the float version.

  • 关于数据定标,如果输入的数据格式是Q15,则FFT的结果已经不是Q15了,已经放大了(同样见Official page表格)。比如2048点的Q15输入,输出变成了Q4。个人认为这种定标没什么意义,还不如直接给出结果跟原始数据的关系,因为这种系数纯粹是计算过程引入的,并没有什么物理意义,而且还随点数的变化而变化。有可能不管点数多少,输入幅值和输出幅值都是确定的关系,那这种定标就更没意义了。按前面实测表格,定点数的输出DC值与输入DC值接近,而输出AC幅值则约为输入幅值的一半;而浮点格式下输出DC值约为输入的2倍,而输出AC幅值与输入AC幅值接近。
  • 官方的取模函数是个大坑,它把实部和虚部求平方和后把这个平方和的结果归一回Q14(右移17位),然后再求开方,这样平方和结果小于17位的输入就都被移成0了。对RFFT来说,虚部接近于零,因此小于9位的数据基本都被移为0了。即使较大的数开方后的结果也变得很小了。初步比较,发现计算过程是平方和右移17位,开方后左移6位,或者说是平方和右移5位再开方。迷之操作。

Test Code

/* USER CODE BEGIN Header */
/**
  ******************************************************************************
  * @file           : main.c
  * @brief          : Main program body
  ******************************************************************************
  * @attention
  *
  * Copyright (c) 2024 STMicroelectronics.
  * All rights reserved.
  *
  * This software is licensed under terms that can be found in the LICENSE file
  * in the root directory of this software component.
  * If no LICENSE file comes with this software, it is provided AS-IS.
  *
  ******************************************************************************
  */
/* USER CODE END Header */
/* Includes ------------------------------------------------------------------*/
#include "main.h"

/* Build-time switches for the benchmark.
 *
 * GENERATE_INPUT_ONLINE : when defined, the input buffer is declared in this
 *                         file and filled by FFT_GenerateDate(); otherwise
 *                         "iInputData.h" is included instead.
 * Q31_TEST              : when defined (and FLOAT_TEST is not), the Q31 RFFT
 *                         path is built. With neither FLOAT_TEST nor Q31_TEST
 *                         defined, the Q15 path is built.
 * USER_MAG_CALC         : Q15 path only; selects user_cmplx_mag_q15() instead
 *                         of arm_cmplx_mag_q15() and makes iFFT_Mag a q31_t
 *                         array.
 *
 * FLOAT_TEST is also tested further down in this file but is not defined
 * here; define it (e.g. on the compiler command line) to build the float32
 * path.
 */
#define GENERATE_INPUT_ONLINE
// #define Q31_TEST

// #define USER_MAG_CALC

/* Private includes ----------------------------------------------------------*/
/* USER CODE BEGIN Includes */
#include <stdbool.h>
#include "arm_math.h"
#ifndef GENERATE_INPUT_ONLINE
#include "iInputData.h"
#endif
#include "arm_const_structs.h"
/* USER CODE END Includes */

/* Private typedef -----------------------------------------------------------*/
/* USER CODE BEGIN PTD */

/* USER CODE END PTD */

/* Private define ------------------------------------------------------------*/
/* USER CODE BEGIN PD */
/* USER CODE END PD */

/* Private macro -------------------------------------------------------------*/
/* USER CODE BEGIN PM */

/* USER CODE END PM */

/* Private variables ---------------------------------------------------------*/

/* USER CODE BEGIN PV */

/* USER CODE END PV */

/* Private function prototypes -----------------------------------------------*/
void SystemClock_Config(void);
static void MX_CORDIC_Init(void);
/* USER CODE BEGIN PFP */
/* Number of real input samples per RFFT. The FFT instances referenced in this
 * file are the fixed 2048-point ones, so N has to stay 2048 unless those
 * instance names are changed as well. */
#define   N                 2048
/* Sample rate in Hz. Only referenced by the commented-out phase-step code in
 * FFT_GenerateDate(). */
#define   SAMPLE_FREQUENCY  1000


#ifdef FLOAT_TEST
/* float32 path: real input, RFFT output and magnitude buffers. */
float32_t fInputData[N];
float32_t fOutputData[N+2];
float32_t fMag[N+1];

#elif defined(Q31_TEST)

/* Q31 path. */
// arm_rfft_instance_q15 rfftInstance;
#ifdef GENERATE_INPUT_ONLINE
/* IAR pragma: requests 16-byte alignment for the declaration that follows. */
#pragma data_alignment=16
static q31_t iInputData[N];
#endif

/* The fixed-point RFFT needs an output buffer of 2*N elements for N inputs
 * (see the CMSIS-DSP note quoted in the article above). */
#pragma data_alignment=16
q31_t iOutputData[N*2];
/* One magnitude per bin 0..N/2. */
q31_t iFFT_Mag[(N>>1)+1];

#else

/* Q15 path (the default when neither FLOAT_TEST nor Q31_TEST is defined). */
// arm_rfft_instance_q15 rfftInstance;
#ifdef GENERATE_INPUT_ONLINE
/* IAR pragma: requests 16-byte alignment for the declaration that follows. */
#pragma data_alignment=16
static q15_t iInputData[N];
#endif

/* The fixed-point RFFT needs an output buffer of 2*N elements for N inputs
 * (see the CMSIS-DSP note quoted in the article above). */
#pragma data_alignment=16
q15_t iOutputData[N*2];
/* One entry per bin 0..N/2. The element type follows the magnitude routine:
 * user_cmplx_mag_q15() writes q31_t, arm_cmplx_mag_q15() writes q15_t. */
#ifdef USER_MAG_CALC
q31_t iFFT_Mag[(N>>1)+1];
#else
q15_t iFFT_Mag[(N>>1)+1];
#endif
#endif

/* Test-signal parameters.
 * frequency1/frequency2 are only referenced by commented-out code in
 * FFT_GenerateDate(); the active code derives the tone phases from
 * bin1Index/bin2Index instead. */
uint16_t frequency1 = 8; // Hz
uint16_t frequency2 = 32; // Hz
uint16_t amplitude1 = 1024;   /* modulus passed to the CORDIC for tone 1 */
uint16_t amplitude2 = 256;    /* modulus passed to the CORDIC for tone 2 */
q15_t iOffset = 128;          /* constant added to every generated sample */

/* Result of the last measurement, in microseconds as computed by
 * TimeMeasure(). Nothing in this file reads it back, so it is volatile to
 * keep the store observable (e.g. from a debugger watch window). */
volatile uint32_t DurationUs = 0;

/* Phase increment per sample and per bin, in 1/65536ths of a turn:
 * N samples of bin k advance the 16-bit phase by exactly k turns. */
uint16_t binIndexScale = (1UL << 16) / N;
uint16_t bin1Index = 1;
uint16_t bin2Index = 2;

/* Run-time controls. No code in this file ever sets runOnce to true, so these
 * flags are evidently changed from outside the normal program flow
 * (presumably a debugger). They must be volatile: otherwise an optimising
 * compiler may read them once and hoist the test out of main()'s polling
 * loop, and the benchmark would never start. */
volatile bool calcMag = false;  /* true: also time the magnitude calculation */
volatile bool runOnce = false;  /* set to true to trigger one benchmark run  */

/* Constant 2048-point RFFT instance for the selected data type. Only declared
 * here; the definition is not in this file (presumably supplied by the
 * CMSIS-DSP library sources -- confirm the symbol is linked in). */
#ifdef FLOAT_TEST
extern const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048;
#elif defined(Q31_TEST)
extern const arm_rfft_instance_q31 arm_rfft_sR_q31_len2048;
#else
extern const arm_rfft_instance_q15 arm_rfft_sR_q15_len2048;
#endif
/* USER CODE END PFP */

/* Private user code ---------------------------------------------------------*/
/* USER CODE BEGIN 0 */

#ifndef FLOAT_TEST
/**
  * @brief  Squared magnitude of a Q15 complex vector.
  *
  * For every complex sample (re, im) the value re*re + im*im is written to
  * the destination as a 32-bit number. Despite the function name, no square
  * root and no rescaling shift are applied, so small inputs are not flushed
  * to zero.
  *
  * @param  pSrc        interleaved input: re0, im0, re1, im1, ...
  *                     (2 * numSamples q15_t values are read)
  * @param  pDst        output, numSamples q31_t values
  * @param  numSamples  number of complex samples
  *
  * @note   re = im = -32768 gives 2^31, which does not fit a q31_t. Both
  *         build variants wrap in that single case (result 0x80000000).
  */
void user_cmplx_mag_q15(q15_t* pSrc, q31_t* pDst, uint16_t numSamples)
{
  uint16_t blkCnt = numSamples;

  while (blkCnt > 0U)
  {
    /* C[0] = A[0] * A[0] + A[1] * A[1] */

#if defined (ARM_MATH_DSP)
    /* Fetch re/im as one packed word and let the dual 16x16 multiply-add
       produce re*re + im*im in a single instruction. */
    q31_t in = read_q15x2_ia ((q15_t **) &pSrc);

    *pDst++ = (q31_t) __SMUAD(in, in);
#else
    /* Portable variant. The original code computed both squares here but
       never stored them, leaving the destination buffer untouched. */
    q31_t real = *pSrc++;
    q31_t imag = *pSrc++;

    /* Add as unsigned so the one overflowing case wraps like __SMUAD instead
       of being undefined behaviour. */
    *pDst++ = (q31_t) ((uint32_t) (real * real) + (uint32_t) (imag * imag));
#endif

    /* Decrement loop counter */
    blkCnt--;
  }
}
#endif

/* Prepare the DWT cycle counter that main() samples around the FFT call. */
void TimeMeasureInit(void)
{
  /* Set TRCENA in the debug exception and monitor control register. */
  CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;

  /* A control register that reads back as zero is taken to mean that no DWT
     unit is present; leave everything else untouched in that case. */
  if (DWT->CTRL == 0U)
  {
    return;
  }

  /* Start counting from zero. */
  DWT->CYCCNT = 0;
  DWT->CTRL  |= DWT_CTRL_CYCCNTENA_Msk;
}

/* One-time benchmark preparation: configures the CORDIC peripheral used by
 * FFT_GenerateDate() and starts the cycle counter used for timing. */
void Setup(void)
{
  /* Cordic setup: sine function, 5-cycle precision, scale 0, one data write
     and one data read per operation, 16-bit input and output format
     (as named by the LL_CORDIC_* selectors below). */
  LL_CORDIC_Config( CORDIC, 
                  LL_CORDIC_FUNCTION_SINE, 
                  LL_CORDIC_PRECISION_5CYCLES,
                  LL_CORDIC_SCALE_0,
                  LL_CORDIC_NBWRITE_1,
                  LL_CORDIC_NBREAD_1,
                  LL_CORDIC_INSIZE_16BITS,
                  LL_CORDIC_OUTSIZE_16BITS);

  /* setup time measurement */
  TimeMeasureInit();

  /* Setup fft instance: not needed at run time, the constant
     arm_rfft_*_len2048 instances declared in this file are used instead. */
  // arm_rfft_init_q15(&rfftInstance, N, false, true);

}


/* Run one CORDIC operation: the modulus goes into the upper and the 16-bit
 * angle into the lower half of the argument word; the upper half-word of the
 * result word is returned.
 * NOTE(review): per the CORDIC chapter of RM0440 the upper half-word is the
 * secondary result (cosine when the sine function is selected) -- confirm.
 * For this benchmark that only shifts the phase of the test tones. */
static uint32_t CordicUpperHalf(uint16_t modulus, uint32_t angle)
{
  uint32_t word = ((uint32_t)modulus << 16) + (uint16_t)angle;

  LL_CORDIC_WriteData(CORDIC, word);
  word = LL_CORDIC_ReadData(CORDIC);

  return (word >> 16) & 0xFFFF;
}

/* Fill the input buffer with N samples of a two-tone test signal:
 * tone 1 (amplitude1, bin1Index) plus iOffset plus tone 2 (amplitude2,
 * bin2Index). In the float build each sample is divided by amplitude1.
 * (The name presumably means "GenerateData"; it is kept because main()
 * calls it under this name.) */
void FFT_GenerateDate(void)
{
  int sample = 0;

  while (sample < N) {
    /* Phase in 1/65536ths of a turn; only the low 16 bits reach the CORDIC,
       so the phase wraps once per turn. */
    const uint32_t phase1 = sample * bin1Index * binIndexScale;
    const uint32_t phase2 = sample * bin2Index * binIndexScale;
    int16_t value;

    /* Sums are formed in 32 bits and then truncated to 16 bits. */
    value  = CordicUpperHalf(amplitude1, phase1) + iOffset;
    value += CordicUpperHalf(amplitude2, phase2);

#ifdef FLOAT_TEST
    fInputData[sample] = (float32_t)value/amplitude1;
#else
    iInputData[sample] = value;
#endif

    sample++;
  }
}

#ifndef FLOAT_TEST
/* Fixed-point benchmark body: one 2048-point real FFT of iInputData into
 * iOutputData, followed -- only when calcMag is set -- by the magnitude of
 * bins 0..N/2 into iFFT_Mag. Q31_TEST selects the Q31 routines, otherwise the
 * Q15 routines are used.
 * Per the CMSIS-DSP note quoted in the article above, the fixed-point RFFT
 * modifies its input buffer. */
void iFFTCalc(void)
{
#ifdef Q31_TEST
  arm_rfft_q31(&arm_rfft_sR_q31_len2048, iInputData, iOutputData);
#else
  arm_rfft_q15(&arm_rfft_sR_q15_len2048, iInputData, iOutputData);
#endif

  if (!calcMag) {
    return;
  }

#if defined(Q31_TEST)
  arm_cmplx_mag_q31(iOutputData, iFFT_Mag, N/2 + 1);
#elif defined(USER_MAG_CALC)
  /* Squared magnitude without the library's rescaling. */
  user_cmplx_mag_q15(iOutputData, iFFT_Mag, N/2 + 1);
#else
  arm_cmplx_mag_q15(iOutputData, iFFT_Mag, N/2 + 1);
#endif
}
#endif


#ifdef FLOAT_TEST
/* Float benchmark body: one 2048-point real FFT of fInputData into
 * fOutputData, followed -- only when calcMag is set -- by the magnitude of the
 * resulting complex bins into fMag. */
void fFFTCalc(void)
{
  arm_rfft_fast_f32(&arm_rfft_fast_sR_f32_len2048, fInputData, fOutputData, 0);
  if (calcMag) {
    /* arm_rfft_fast_f32 produces N floats, i.e. N/2 interleaved complex bins.
       arm_cmplx_mag_f32 reads 2 * numSamples floats, so the previous count of
       N made it read 2*N floats from the (N+2)-element fOutputData -- an
       out-of-bounds read that also doubled the measured magnitude time.
       NOTE(review): per the CMSIS-DSP documentation the first output pair
       packs the (real) DC and Nyquist terms, so fMag[0] is not a plain bin
       magnitude -- confirm before interpreting it. */
    arm_cmplx_mag_f32(fOutputData, fMag, N / 2);
  }
}
#endif


/**
  * @brief  Convert two cycle-counter readings into an elapsed time.
  * @param  startTick  counter value sampled before the measured code
  * @param  stopTick   counter value sampled after the measured code
  * @retval Elapsed time in whole microseconds (rounded down). Correct as long
  *         as the 32-bit counter wrapped at most once between the readings.
  */
uint32_t TimeMeasure(uint32_t startTick, uint32_t stopTick)
{
  /* Counter ticks per microsecond; assumes a 170 MHz core clock.
     NOTE(review): SystemClock_Config() in this file selects HSI with the PLL
     switched off -- confirm the clock the core really runs at. */
  const uint32_t cyclesPerUs = 170U;

  /* Unsigned subtraction is performed modulo 2^32, so it already yields the
     right distance when the counter wrapped. The former special case,
     (UINT32_MAX - startTick) + stopTick, came out one cycle short. */
  const uint32_t deltaTick = stopTick - startTick;

  return deltaTick / cyclesPerUs;
}


/* USER CODE END 0 */

/**
  * @brief  The application entry point.
  * @retval int
  */
int main(void)
{
  /* USER CODE BEGIN 1 */

  /* USER CODE END 1 */

  /* MCU Configuration--------------------------------------------------------*/

  /* Reset of all peripherals, Initializes the Flash interface and the Systick. */
  HAL_Init();

  /* USER CODE BEGIN Init */

  /* USER CODE END Init */

  /* Configure the system clock */
  SystemClock_Config();

  /* USER CODE BEGIN SysInit */

  /* USER CODE END SysInit */

  /* Initialize all configured peripherals */
  MX_CORDIC_Init();
  /* USER CODE BEGIN 2 */
  /* Clear TICKINT so SysTick no longer raises its exception; presumably done
     so that the tick handler cannot interrupt the timed section. */
  SysTick->CTRL &= ~SysTick_CTRL_TICKINT_Msk;
  Setup();

  /* USER CODE END 2 */

  /* Infinite loop */
  /* USER CODE BEGIN WHILE */
  while (1)
  {
    /* USER CODE END WHILE */

    /* USER CODE BEGIN 3 */
    /* The benchmark lives inside the "USER CODE 3" section: STM32CubeMX only
       preserves text between USER CODE BEGIN/END markers when it regenerates
       this file. */

    /* One benchmark run per trigger: runOnce is set from outside this loop
       and cleared again here once the run has finished. */
    if (runOnce) {
#ifdef GENERATE_INPUT_ONLINE
      /* Regenerate the input before every run (not part of the timing). */
      FFT_GenerateDate();
#endif

      /* Only the FFT (plus the optional magnitude step) is timed. */
      uint32_t startTick = DWT->CYCCNT;
#ifdef FLOAT_TEST
      fFFTCalc();
#else
      iFFTCalc();
#endif
      uint32_t stopTick = DWT->CYCCNT;
      DurationUs = TimeMeasure(startTick, stopTick);

      runOnce = false;
    }
  }
  /* USER CODE END 3 */
}

/**
  * @brief System Clock Configuration
  * @retval None
  */
void SystemClock_Config(void)
{
  RCC_OscInitTypeDef RCC_OscInitStruct = {0};
  RCC_ClkInitTypeDef RCC_ClkInitStruct = {0};

  /** Configure the main internal regulator output voltage
  */
  HAL_PWREx_ControlVoltageScaling(PWR_REGULATOR_VOLTAGE_SCALE1);

  /** Initializes the RCC Oscillators according to the specified parameters
  * in the RCC_OscInitTypeDef structure.
  */
  RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_HSI;
  RCC_OscInitStruct.HSIState = RCC_HSI_ON;
  RCC_OscInitStruct.HSICalibrationValue = RCC_HSICALIBRATION_DEFAULT;
  RCC_OscInitStruct.PLL.PLLState = RCC_PLL_NONE;
  if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK)
  {
    Error_Handler();
  }

  /** Initializes the CPU, AHB and APB buses clocks
  */
  RCC_ClkInitStruct.ClockType = RCC_CLOCKTYPE_HCLK|RCC_CLOCKTYPE_SYSCLK
                              |RCC_CLOCKTYPE_PCLK1|RCC_CLOCKTYPE_PCLK2;
  RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_HSI;
  RCC_ClkInitStruct.AHBCLKDivider = RCC_SYSCLK_DIV1;
  RCC_ClkInitStruct.APB1CLKDivider = RCC_HCLK_DIV1;
  RCC_ClkInitStruct.APB2CLKDivider = RCC_HCLK_DIV1;

  if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_0) != HAL_OK)
  {
    Error_Handler();
  }
}

/**
  * @brief CORDIC Initialization Function
  * @param None
  * @retval None
  */
static void MX_CORDIC_Init(void)
{

  /* USER CODE BEGIN CORDIC_Init 0 */

  /* USER CODE END CORDIC_Init 0 */

  /* Peripheral clock enable: the only step performed here. The CORDIC
     function, precision and data format are configured later in Setup(). */
  LL_AHB1_GRP1_EnableClock(LL_AHB1_GRP1_PERIPH_CORDIC);

  /* USER CODE BEGIN CORDIC_Init 1 */

  /* USER CODE END CORDIC_Init 1 */

  /* nothing else to be configured */

  /* USER CODE BEGIN CORDIC_Init 2 */

  /* USER CODE END CORDIC_Init 2 */

}

/* USER CODE BEGIN 4 */

/* USER CODE END 4 */

/**
  * @brief  This function is executed in case of error occurrence.
  * @retval None
  */
void Error_Handler(void)
{
  /* USER CODE BEGIN Error_Handler_Debug */
  /* User can add his own implementation to report the HAL error return state */
  /* Called from SystemClock_Config() when a HAL clock call does not return
     HAL_OK. Interrupts are disabled and the function never returns. */
  __disable_irq();
  while (1)
  {
    /* Spin forever. */
  }
  /* USER CODE END Error_Handler_Debug */
}

#ifdef  USE_FULL_ASSERT
/**
  * @brief  Reports the name of the source file and the source line number
  *         where the assert_param error has occurred.
  * @param  file: pointer to the source file name
  * @param  line: assert_param error line source number
  * @retval None
  */
void assert_failed(uint8_t *file, uint32_t line)
{
  /* USER CODE BEGIN 6 */
  /* User can add his own implementation to report the file name and line number,
     ex: printf("Wrong parameters value: file %s on line %d\r\n", file, line) */
  /* Intentionally empty: both arguments are currently ignored and the
     function simply returns. Only compiled when USE_FULL_ASSERT is defined. */
  /* USER CODE END 6 */
}
#endif /* USE_FULL_ASSERT */

  • 21
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值