/* Split a .raw file according to the energy of the speech signal (Linux) */
/*
# Filename: cut_speech.c
# Revision: 3
# Purpose : Read RAW-format data from stdin and extract speech
# Date : 2018/01/03
# Usage : cutsp -w <window size> -f <frame shift> -s <frequency> -u <unvoice_term> -v <voice_term> -n <unvoice_threshold> -t <voice_threshold>
# Supposed: cutsp -w 100 -f 10 -s 16000 -u 200 -v 50 -t 1000 -n 1000
# : Using DAT-link+
# : 16 bit sampling
# : option:
# -w <window size> [msec] ;100
# -f <frame shift> ;10
# -s <frequency> ;16000
# -u <unvoice term> ;200
# -v <voice term> ;50
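# -n <unvoice threshold> ;1000 (RMS level below which a frame counts as silence while inside an utterance)
# -t <voice threshold> ;1000 (RMS level at or above which a frame counts as speech while outside an utterance)
# -p <output prefix> (prepended to the NNN.raw output file names)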
# Original: extract_speech.c (/dD/kikuchi/src/dialogue/extract_speech)
detect.c (/dD/kikuchi/src/dialogue/agent/proto/rev2)
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <malloc.h>
#include <sys/types.h>
#include <sys/stat.h>
/* Limit on utterance numbering: utterance numbers must stay below this value. */
#define MAXUTTSIZE 1000
/* Samples in one analysis window: 48000 samples/sec * 0.1 sec = 4800. */
#define SAMPSIZE 4800 /* sampling_frequency=48[kHz], window_size=100[msec] */
/*
 * Capacity, in samples, of the buffer that holds the whole input.
 * 30 min at 48 kHz is 86,400,000 samples, so the value leaves some headroom.
 */
#define MAXSAMPSIZE 100000000 /* length=30[min]=1800[sec], sampling_frequency=48[kHz] */
/* Capacity, in samples, of a single utterance: 30 sec * 48000 samples/sec. */
#define MAXSPEECHSIZE 1440000 /* length=30[sec],sampling_frequency=48[kHz] */
/* Window-sized buffer of 16-bit samples (file-local). */
static short idata[SAMPSIZE];
/* Utterance-sized buffer of 16-bit samples. */
short buf[MAXSPEECHSIZE];
/* Two more window-sized sample buffers. */
short new_idata[SAMPSIZE],tmp_idata[SAMPSIZE];
/* Heap buffer for the accumulated input samples; allocated inside main(). */
short *all_idata;
int main(int argc, char *argv[])
{
/* DECLARATION OF VARIABLES */
int frame_shift;
int window_size;
int frequency;
int unvoice_term;
int voice_term;
int v_threshold;
int u_threshold;
int start_frame;
int end_frame;
int nframp;
int block_points;
int speech_flag=0;
int unvoice_time=0;
int voice_time=0;
double power;
int a = 0;
int i, j, k, m, n;
FILE *utt_fp[MAXUTTSIZE];
char *utt_fname[MAXUTTSIZE];
int utt_num = 1;
char sPath[100], delefile[100];
int sPathLen;
/* SET DEFAULT VALUE */
frame_shift = 10; /* 10 msec */
window_size = 100; /* 200 msec */
frequency = 12000; /* 16000 Hz */
unvoice_term = 200; /* 200 msec */
voice_term = 50; /* 50 msec */
u_threshold = 1000; /* ??? */
v_threshold = 1000; /* ??? */
/* SET OPTION FROM ARGUMENTS */
if (argc < 2) usage();
for (i = 1; i < argc; i++) {
switch(argv[i][0]) {
case '-':
switch(argv[i][1]) {
case 'w':
sscanf(argv[++i], "%d", &window_size);
break;
case 'f':
sscanf(argv[++i], "%d", &frame_shift);
break;
case 's':
sscanf(argv[++i], "%d", &frequency);
break;
case 'u':
sscanf(argv[++i], "%d", &unvoice_term);
break;
case 'v':
sscanf(argv[++i], "%d", &voice_term);
break;
case 't':
sscanf(argv[++i], "%d", &v_threshold);
break;
case 'n':
sscanf(argv[++i], "%d", &u_threshold);
break;
case 'p':
sscanf(argv[++i], "%s",sPath);
sPathLen=strlen(sPath);
break;
default:
usage();
break;
}
break;
default:
usage();
break;
}
}
/* SET INITIAL VALUE */
nframp = frame_shift * frequency / 1000;
block_points = window_size/frame_shift * nframp;
/* START EXTRACTION */
fprintf(stderr, "start extracting speech...\n");
/* OPEN FILE FOR OUTPUT WAVEFORM DATA */
utt_fname[utt_num] = malloc(sPathLen+strlen("001.raw"));
sprintf(utt_fname[utt_num],"%s%.3d.raw",sPath,utt_num);
utt_fp[utt_num] = fopen(utt_fname[utt_num], "w");
power=0.0;
fread((char *)idata, 2, block_points, stdin);
for (i = 0; i < block_points; i++) {
power += ((double) idata[i]) * ((double) idata[i]);
}
power /= (double) block_points;
power = sqrt(power);
all_idata=(short *)malloc(sizeof(short)*MAXSAMPSIZE);
for(i=0;i<block_points;i++){
all_idata[a++]=idata[i];
}
if(power < v_threshold){
start_frame=0;
}else{
start_frame=1;
}
/* READ BLOCKS OF WAVEFORM */
for (i = 1;
(fread((char *)idata, 2, nframp, stdin) == nframp);
i++) {
for(j=0;j<nframp;j++){
all_idata[a++]=idata[j];
}
k=0;
for(j=0;j<block_points;j++){
new_idata[k++]=all_idata[j+i*nframp];
}
power=0.0;
for (j = 0; j < block_points; j++) {
power += ((double) new_idata[j]) * ((double) new_idata[j]);
}
power /= (double) block_points;
power = sqrt(power);
#ifdef DEBUG
fprintf(stderr, "power[%d][%d]-[%d]:[%f]",i,i*frame_shift,i*frame_shift+window_size,power);
#endif
/* JUDGE POWER LEVEL */
if(speech_flag){
if (power < u_threshold){
unvoice_time+=frame_shift;
voice_time=0;
if(unvoice_time == unvoice_term){
#ifdef DEBUG
fprintf(stderr, "utterance %.3d ends.(time=[%d])\n", utt_num, i*frame_shift+window_size-unvoice_term);
#endif
speech_flag = 0;
voice_time=0;
n=0;
for (m=start_frame*nframp-window_size/frame_shift*nframp; m<(i-unvoice_term/frame_shift+window_size/frame_shift)*nframp;m++) {
buf[n++] = all_idata[m];
}
fwrite(&buf[0], 2, n, utt_fp[utt_num]);
fflush(utt_fp[utt_num]);
#ifdef DEBUG
fprintf(stderr, "finish to write to %s.\n",utt_fname[utt_num]);
#endif
fprintf(stdout, "%04d %.3f-%.3f\n",utt_num,
(float)(start_frame*frame_shift-window_size)/1000,
(float)(i-unvoice_term/frame_shift+window_size/frame_shift)*frame_shift/1000);
fclose(utt_fp[utt_num]);
free(utt_fname[utt_num]);
utt_num++;
utt_fname[utt_num] = malloc(sPathLen+strlen("001.raw"));
sprintf(utt_fname[utt_num],"%s%.3d.raw",sPath,utt_num);
utt_fp[utt_num] = fopen(utt_fname[utt_num], "w");
}else{
#ifdef DEBUG
fprintf(stderr, "can't judge.(unvoice[%d], voice[%d])\n",unvoice_time,voice_time);
#endif
}
}else{
voice_time+=frame_shift;
unvoice_time=0;
#ifdef DEBUG
fprintf(stderr, "in utterance.(unvoice[%d], voice[%d])\n",unvoice_time,voice_time);
#endif
}
}else{
if (power < v_threshold){ ったら */
unvoice_time+=frame_shift;
voice_time=0;
start_frame = i+1;
#ifdef DEBUG
fprintf(stderr, "no utterance.(unvoice[%d], voice[%d])\n",unvoice_time,voice_time);
#endif
}else{
voice_time+=frame_shift;
unvoice_time=0;
if(voice_time >= voice_term){
#ifdef DEBUG
fprintf(stderr, "utterance %.3d starts.\n", utt_num);
fprintf(stderr, "start to write %s.\n", utt_fname[utt_num]);
#endif
speech_flag = 1;
}
else{
#ifdef DEBUG
fprintf(stderr, "can't judge.(unvoice[%d], voice[%d])\n",unvoice_time,voice_time);
#endif
}
}
}
}
fprintf(stderr, "done.\n");
fclose(utt_fp[utt_num]);
sprintf(delefile,"rm -rf %s",utt_fname[utt_num]);
system(delefile);
free(utt_fname[utt_num++]);
return 0;
}
/*
 * Print the command-line synopsis on stderr and terminate the program with
 * exit status 1.  Never returns.
 *
 * The return type is spelled out as int (the original definition relied on
 * implicit int, which is invalid since C99) so the definition still matches
 * callers that reach it through an implicit or `int usage()` declaration.
 */
int usage(void)
{
    fprintf(stderr,
            "Usage: cutsp -w window_size -f frame_shift -s frequency -u unvoice_term -v voice_term -t voice_thresh -n unvoice_thresh -p output_prefix\n");
    exit(1);
}