语音切分程序

最新推荐文章于 2024-01-24 11:58:01 发布

chf304071711

最新推荐文章于 2024-01-24 11:58:01 发布

阅读量2.4k

点赞数

分类专栏：语音识别、算法　文章标签：语音切分

本文链接：https://blog.csdn.net/chf304071711/article/details/78966150

版权

语音识别同时被 2 个专栏收录

4 篇文章 0 订阅

订阅专栏

算法

2 篇文章 0 订阅

订阅专栏

根据语音信号能量的大小切分.raw文件(Linux)

/*
# Filename: cut_speech.c
# Revision: 3
# Purpose : Read RAW-format data from stdin and extract speech
# Date    : 2018/01/03
# Usage   : cutsp -w <window size> -f <frame shift> -s <frequency> -u <unvoice_term> -v <voice_term> -n <unvoice_threshold> -t <voice_threshold>
# Supposed: cutsp -w 100 -f 10 -s 16000 -u 200 -v 50 -t 1000 -n 1000
#         : Using DAT-link+
#         : 16 bit sampling
#         : option:
#                  -w <window size> [msec] ;100
#                  -f <frame shift>        ;10
#                  -s <frequency>          ;16000
#                  -u <unvoice term>     ;200
#                  -v <voice term>     ;50
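#                  -n <unvoice threshold>  ;1000 (inside an utterance, RMS power below this counts as silence)
#                  -t <voice threshold>    ;1000 (outside an utterance, RMS power below this counts as silence)
#                  -p <output prefix>      ;prepended to the NNN.raw output file names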
# Original: extract_speech.c (/dD/kikuchi/src/dialogue/extract_speech)
            detect.c (/dD/kikuchi/src/dialogue/agent/proto/rev2)
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <malloc.h>

/*
 * Capacity limits, all expressed in 16-bit samples except MAXUTTSIZE.
 * The figures in the trailing comments assume 48 kHz input.
 */
/* Upper bound on the number of utterance output files. */
#define MAXUTTSIZE 1000
/* Capacity of the per-window sample buffers (idata, new_idata, tmp_idata). */
#define SAMPSIZE 4800   /* sampling_frequency=48[kHz], window_size=100[msec] */
/* Capacity of the whole-input buffer all_idata.
 * NOTE(review): 30 min at 48 kHz is 86,400,000 samples; the value here is larger. */
#define MAXSAMPSIZE 100000000  /* length=30[min]=1800[sec], sampling_frequency=48[kHz] */
/* Capacity of the single-utterance staging buffer buf. */
#define MAXSPEECHSIZE 1440000  /* length=30[sec],sampling_frequency=48[kHz] */

/* Read buffer for samples coming from stdin. */
static short idata[SAMPSIZE];
/* Staging buffer sized for one utterance. */
short buf[MAXSPEECHSIZE];
/* Scratch buffers of one analysis window each.
 * NOTE(review): tmp_idata is not referenced anywhere in the visible source. */
short new_idata[SAMPSIZE],tmp_idata[SAMPSIZE];
/* Heap buffer holding every sample read so far; allocated in main(). */
short *all_idata;

int main(int argc, char *argv[])
{
   /* DECLARATION OF VARIABLES */
   int frame_shift;    
   int window_size;  
   int frequency;      
   int unvoice_term;    
   int voice_term;     
   int v_threshold;      
   int u_threshold;       

   int start_frame;    
   int end_frame;     
   int nframp;          
   int block_points;   
   int speech_flag=0;   
   int unvoice_time=0; 
   int voice_time=0;    
   double power;       

   int a = 0;
   int i, j, k, m, n;
   FILE *utt_fp[MAXUTTSIZE];
   char *utt_fname[MAXUTTSIZE];
   int utt_num = 1;

   char sPath[100], delefile[100];
   int  sPathLen;

   /* SET DEFAULT VALUE */
   frame_shift = 10;    /* 10 msec */
   window_size = 100;   /* 200 msec */
   frequency = 12000;   /* 16000 Hz */
   unvoice_term = 200;  /* 200 msec */
   voice_term = 50;     /* 50 msec */
   u_threshold = 1000;  /* ??? */
   v_threshold = 1000;  /* ??? */

   /* SET OPTION FROM ARGUMENTS */
   if (argc < 2) usage();
   for (i = 1; i < argc; i++) {
     switch(argv[i][0]) {
     case '-':
       switch(argv[i][1]) {
       case 'w':
     sscanf(argv[++i], "%d", &window_size);
     break;
       case 'f':
     sscanf(argv[++i], "%d", &frame_shift);
     break;
       case 's':
     sscanf(argv[++i], "%d", &frequency);
     break;
       case 'u':
     sscanf(argv[++i], "%d", &unvoice_term);
     break;
       case 'v':
     sscanf(argv[++i], "%d", &voice_term);
     break;
       case 't':
     sscanf(argv[++i], "%d", &v_threshold);
     break;
       case 'n':
     sscanf(argv[++i], "%d", &u_threshold);
     break;
       case 'p':
     sscanf(argv[++i], "%s",sPath);
         sPathLen=strlen(sPath);

     break;

       default:
     usage();
     break;
       }
       break;
     default:
       usage();
       break;
     }
   }

   /* SET INITIAL VALUE */
   nframp = frame_shift * frequency / 1000;
   block_points = window_size/frame_shift * nframp;


   /* START EXTRACTION */
   fprintf(stderr, "start extracting speech...\n");

   /* OPEN FILE FOR OUTPUT WAVEFORM DATA */
   utt_fname[utt_num] = malloc(sPathLen+strlen("001.raw"));
   sprintf(utt_fname[utt_num],"%s%.3d.raw",sPath,utt_num);
   utt_fp[utt_num] = fopen(utt_fname[utt_num], "w");

   power=0.0;
   fread((char *)idata, 2, block_points, stdin);
   for (i = 0; i < block_points; i++) {
     power += ((double) idata[i]) * ((double) idata[i]);
   }
   power /= (double) block_points;
   power = sqrt(power);


   all_idata=(short *)malloc(sizeof(short)*MAXSAMPSIZE);
   for(i=0;i<block_points;i++){
     all_idata[a++]=idata[i];
   }
   if(power < v_threshold){
     start_frame=0;
   }else{
     start_frame=1;
   }


   /* READ BLOCKS OF WAVEFORM */
   for (i = 1; 
    (fread((char *)idata, 2, nframp, stdin) == nframp);
    i++) {

     for(j=0;j<nframp;j++){
       all_idata[a++]=idata[j];
     }

     k=0;
     for(j=0;j<block_points;j++){
       new_idata[k++]=all_idata[j+i*nframp];
     }

     power=0.0;
     for (j = 0; j < block_points; j++) {
       power += ((double) new_idata[j]) * ((double) new_idata[j]);
     }
     power /= (double) block_points;
     power = sqrt(power);

#ifdef DEBUG
     fprintf(stderr, "power[%d][%d]-[%d]:[%f]",i,i*frame_shift,i*frame_shift+window_size,power);
#endif

     /* JUDGE POWER LEVEL */
     if(speech_flag){ 
       if (power < u_threshold){
     unvoice_time+=frame_shift; 
     voice_time=0;

     if(unvoice_time == unvoice_term){ 
#ifdef DEBUG
       fprintf(stderr, "utterance %.3d ends.(time=[%d])\n", utt_num, i*frame_shift+window_size-unvoice_term);
#endif
       speech_flag = 0; 
       voice_time=0;

       n=0;
       for (m=start_frame*nframp-window_size/frame_shift*nframp; m<(i-unvoice_term/frame_shift+window_size/frame_shift)*nframp;m++) {
         buf[n++] = all_idata[m];
       }

       fwrite(&buf[0], 2, n, utt_fp[utt_num]);
       fflush(utt_fp[utt_num]);
#ifdef DEBUG
       fprintf(stderr, "finish to write to %s.\n",utt_fname[utt_num]);
#endif
       fprintf(stdout, "%04d %.3f-%.3f\n",utt_num,
           (float)(start_frame*frame_shift-window_size)/1000,
           (float)(i-unvoice_term/frame_shift+window_size/frame_shift)*frame_shift/1000);

       fclose(utt_fp[utt_num]);
       free(utt_fname[utt_num]);

       utt_num++;
       utt_fname[utt_num] = malloc(sPathLen+strlen("001.raw"));
       sprintf(utt_fname[utt_num],"%s%.3d.raw",sPath,utt_num);
       utt_fp[utt_num] = fopen(utt_fname[utt_num], "w");

     }else{
#ifdef DEBUG
       fprintf(stderr, "can't judge.(unvoice[%d], voice[%d])\n",unvoice_time,voice_time);
#endif
     }     
       }else{ 
     voice_time+=frame_shift;
     unvoice_time=0;
#ifdef DEBUG
     fprintf(stderr, "in utterance.(unvoice[%d], voice[%d])\n",unvoice_time,voice_time);
#endif
       }
     }else{
       if (power < v_threshold){ ったら */
     unvoice_time+=frame_shift;
     voice_time=0;
     start_frame = i+1; 
#ifdef DEBUG
     fprintf(stderr, "no utterance.(unvoice[%d], voice[%d])\n",unvoice_time,voice_time);
#endif
       }else{
     voice_time+=frame_shift;
     unvoice_time=0;
     if(voice_time >= voice_term){
#ifdef DEBUG
       fprintf(stderr, "utterance %.3d starts.\n", utt_num);
       fprintf(stderr, "start to write %s.\n", utt_fname[utt_num]);
#endif 
       speech_flag = 1; 
     }
     else{
#ifdef DEBUG
       fprintf(stderr, "can't judge.(unvoice[%d], voice[%d])\n",unvoice_time,voice_time);
#endif
     }
       }
     }
   }

   fprintf(stderr, "done.\n");
   fclose(utt_fp[utt_num]);

sprintf(delefile,"rm -rf %s",utt_fname[utt_num]);
system(delefile);

   free(utt_fname[utt_num++]);

   return 0;
}

/*
 * Print the command-line synopsis to stderr and terminate the program with
 * exit status 1.  Declared as returning int so it stays compatible with
 * callers that reference it without a prototype; it never actually returns.
 */
int usage(void)
{
  fprintf(stderr,
      "Usage: cutsp -w window_size -f frame_shift -s frequency -u unvoice_term -v voice_term -n unvoice_thresh -t voice_thresh [-p output_prefix]\n");
  exit(1);
}