kaldi sre16/v1中utils/combine_data.sh

#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#           2014  David Snyder

# This script combines the data from multiple source directories into
# a single destination directory.

# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
# about what these directories contain.

# Begin configuration section.
# NOTE(review): presumably parse_options.sh (sourced below) overrides these
# defaults from --extra-files / --skip-fix; confirm against that script.
extra_files= # space-separated list of additional files in each 'src-data-dir' to merge, e.g. "file1 file2 ..."
skip_fix=false # if true, do not run utils/fix_data_dir.sh on the destination at the end
# End configuration section.
# Both settings are consumed further down in this script.

echo "$0 $@"  # Print the command line for logging

# Source path.sh from the current directory when it exists, then the option
# parser; abort if the parser cannot be sourced or reports an error.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# A destination directory and at least one source directory are required;
# otherwise print the usage text and stop with status 1.
[ $# -ge 2 ] || {
  printf '%s\n' \
    "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..." \
    "Note, files that don't appear in all source dirs will not be combined," \
    "with the exception of utt2uniq and segments, which are created where necessary."
  exit 1
}

# The first positional argument is the destination directory; after the shift
# the remaining positional arguments are the source directories.
dest=$1
shift

# First source directory (not referenced again in the script as visible here).
first_src=$1

# Refuse an empty destination: with it, the rm/mkdir below would operate on
# nothing and every later "$dest/<file>" write would target "/<file>".
if [ -z "$dest" ]; then
  echo "$0: destination directory must not be empty" 1>&2
  exit 1
fi

# Start from a clean destination: remove any previous copy (errors, e.g. when
# it does not exist yet, are deliberately ignored), then recreate it.
rm -r -- "$dest" 2>/dev/null
mkdir -p -- "$dest" || exit 1

# Use the C locale for every command that follows, so that tools such as sort
# order byte-wise and independently of the user's locale settings.
export LC_ALL=C

# Every source directory must contain utt2spk; abort on the first one that
# does not. "$@" (rather than unquoted $*) keeps each directory a single word
# even when its path contains whitespace or glob characters.
for dir in "$@"; do
  if [ ! -f "$dir/utt2spk" ]; then
    echo "$0: no such file $dir/utt2spk"
    exit 1;
  fi
done

# W.r.t. (with respect to) the utt2uniq file, the script behaves differently
# than for other files: it is not compulsory for it to exist in the source
# directories, but if it exists in even one, it should exist in all. Where it
# is missing, an identity mapping is generated from that directory's utt2spk.

# Detect whether any source directory provides utt2uniq.
has_utt2uniq=false
for in_dir in "$@"; do
  if [ -f "$in_dir/utt2uniq" ]; then
    has_utt2uniq=true
    break
  fi
done

if $has_utt2uniq; then
  # we are going to create an utt2uniq file in the destdir
  for in_dir in "$@"; do
    if [ ! -f "$in_dir/utt2uniq" ]; then
      # we assume that utt2uniq is a one to one mapping
      # (input is redirected rather than passed as an operand, so a path
      # containing '=' is never taken by awk as a variable assignment)
      awk '{printf("%s %s\n", $1, $1);}' < "$in_dir/utt2spk"
    else
      cat "$in_dir/utt2uniq"
    fi
  done | sort -k1 > "$dest/utt2uniq"
  echo "$0: combined utt2uniq"
else
  echo "$0 [info]: not combining utt2uniq as it does not exist"
fi
# some of the old scripts might provide utt2uniq as an extrafile, so just remove it

# Print the whitespace-separated names in $1, minus any name that is exactly
# "utt2uniq". Only whole names are dropped, so something like "my_utt2uniq.bak"
# passes through unchanged. The body runs in a subshell, so neither 'set -f'
# nor the loop variables leak into the caller.
_drop_utt2uniq_name() (
  set -f  # split on whitespace only; leave glob characters untouched
  kept=
  for name in $1; do
    [ "$name" = utt2uniq ] && continue
    kept="${kept:+$kept }$name"
  done
  printf '%s\n' "$kept"
)

# extra_files is the option initialised in the configuration section at the top.
extra_files=$(_drop_utt2uniq_name "$extra_files")

# segments are treated similarly to utt2uniq. If it exists in some, but not all
# src directories, then we generate segments where necessary.

# Detect whether any source directory provides a segments file.
has_segments=false
for in_dir in "$@"; do
  if [ -f "$in_dir/segments" ]; then
    has_segments=true
    break
  fi
done

if $has_segments; then
  # The loop runs as the first stage of a pipeline (i.e. in a subshell), so the
  # 'exit 1' inside it only ends that stage; pipefail then makes the whole
  # pipeline fail, so a broken generator is no longer silently ignored.
  set -o pipefail
  for in_dir in "$@"; do
    if [ ! -f "$in_dir/segments" ]; then
      echo "$0 [info]: will generate missing segments for $in_dir" 1>&2
      utils/data/get_segments_for_data.sh "$in_dir" || exit 1
    else
      cat "$in_dir/segments" || exit 1
    fi
  done | sort -k1 > "$dest/segments" || {
    echo "$0: failed to combine segments" 1>&2
    exit 1
  }
  set +o pipefail
  echo "$0: combined segments"
else
  echo "$0 [info]: not combining segments as it does not exist"
fi

# For each standard data file (plus any user-requested extra files), combine it
# into the destination only when it is present in every source directory.
# $extra_files is intentionally unquoted so that it splits into file names.
for file in utt2spk utt2lang utt2dur reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
  exists_somewhere=false
  absent_somewhere=false
  for d in "$@"; do
    if [ -f "$d/$file" ]; then
      exists_somewhere=true
    else
      absent_somewhere=true
    fi
  done

  if ! $absent_somewhere; then
    # With pipefail, the pipeline's status is non-zero if any stage fails, not
    # only the final sort. The inner '|| exit 1' makes the subshell fail on the
    # first unreadable file instead of reporting only the last cat's status.
    set -o pipefail
    ( for f in "$@"; do cat "$f/$file" || exit 1; done ) | sort -k1 > "$dest/$file" || exit 1;
    set +o pipefail
    echo "$0: combined $file"
  else
    if ! $exists_somewhere; then
      echo "$0 [info]: not combining $file as it does not exist"
    else
      echo "$0 [info]: **not combining $file as it does not exist everywhere**"
    fi
  fi
done

# Generate spk2utt from the combined utt2spk; stop if the conversion fails so
# that later steps never run on a missing or truncated spk2utt.
utils/utt2spk_to_spk2utt.pl < "$dest/utt2spk" > "$dest/spk2utt" || {
  echo "$0: failed to create $dest/spk2utt" 1>&2
  exit 1
}

# Unless skip_fix is "true" (it defaults to false in the configuration
# section), run fix_data_dir.sh on the destination directory holding the
# combined data. The value is compared as a string rather than executed as a
# command, so an unexpected option value cannot run an arbitrary program.
# NOTE(review): per the original annotation, fix_data_dir.sh may reduce the
# total number of utterances; confirm against that script.
if [ "$skip_fix" != "true" ]; then
  utils/fix_data_dir.sh "$dest" || exit 1;
fi

exit 0

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值