"""
This script is used for automatic implementation of Language Model Training
Version: 0.1.0
Status: prototype
Author: Fangwen Shu
email: sfw811@hotmail.com
Date: 2018/04/13
"""
import os
import shutil
import commands
import re
## Pre-processing: pick up *.textgrid data
# Copies every TextGrid file found in In_path into a freshly created
# Out_path_TG folder, renaming "*.TextGrid" files to "*.textgrid" first.
In_path = './1_rawinput'
Out_path_TG = './1_output_TG'

# [1] get file list (snapshot taken before any renaming happens below)
file_list = os.listdir(In_path)

# [2] always start from an empty output folder
if os.path.exists(Out_path_TG):
    shutil.rmtree(Out_path_TG)  # delete folder left by a previous run
os.mkdir(Out_path_TG)

# [3] preprocessing file name, save *.textgrid in a single folder
for name in file_list:
    if not name.endswith(('.TextGrid', '.textgrid')):
        continue  # not a TextGrid file, leave it alone
    if name.endswith('.TextGrid'):
        # Replace only the trailing extension. Locating the *first* "." in the
        # name would skip files such as "a.b.TextGrid", which would then be
        # copied with the capitalised extension.
        newname = name[:-len('TextGrid')] + 'textgrid'
        # NOTE: the file is renamed inside the input folder itself.
        os.rename(os.path.join(In_path, name), os.path.join(In_path, newname))
        name = newname
    shutil.copyfile(os.path.join(In_path, name), os.path.join(Out_path_TG, name))
print("------------------------------------------------>PreProcessing Down!<------------------------------------------------\n")
## Step1: textgrid2ref
def strip_rec_markers(line, markers=('text', 'unk')):
    """Return ``line`` with every occurrence of each marker removed.

    The markers are removed one after another in the given order, so a marker
    that only appears once an earlier one has been cut out is removed as well.

    NOTE(review): markers are removed as plain substrings anywhere in the
    line, so a word that merely contains one is altered too -- confirm that
    this is intended for the rec.txt contents.
    """
    for marker in markers:
        line = line.replace(marker, '')
    return line


# [1] delete rec.txt so that the shell script below regenerates it
path_tg2ref = './LM_shirley/env/textgrid2ref'
file_rec = path_tg2ref + '/' + 'rec.txt'
if os.path.exists(file_rec):
    print("OK! exists! %s, first delete it and run the shell file \n" % file_rec)
    os.remove(file_rec)  # delete file
else:
    print("Ok! not exists! %s, then run the shell file \n" % file_rec)

# [2] run run.sh in the folder textgrid2ref
file_run = path_tg2ref + '/' + 'run.sh'
# !!notice!! (1) do not forget to chmod +x run.sh, and change the dir inside run.sh
# !!notice!! (2) also change the dir in the .pl document
# The script could also be launched with commands.getstatusoutput('sh %s' % file_run)
# or os.popen('sh %s' % file_run) to capture its output; os.system is the
# variant in use, and its return value is not inspected here.
os.system(file_run + ' -a')

# [3] after generation of rec.txt, check and delete "text"/"unk" string, since it is useless
if os.path.exists(file_rec):
    print("OK! check and delete \"text\" and \"unk\" string in rec.txt")
    with open(file_rec, 'r') as fp:
        rec_lines = fp.readlines()
    # One read and one rewrite are enough: both markers are stripped from each
    # line before it is written back.
    with open(file_rec, 'w') as fp:
        fp.writelines(strip_rec_markers(line) for line in rec_lines)
print("------------------------------------------------>Textgrid2ref Down!<------------------------------------------------ \n")
## Step2: buildLM
# Places a copy of the file named by file_rec (assigned earlier in this
# script) into the buildLM data folder and starts that folder's run.sh on it.
path_buildLM = './LM_shirley/env/buildLM/data'
file_rec_b = '/'.join([path_buildLM, 'rec.txt'])
file_run_b = '/'.join([path_buildLM, 'run.sh'])

# remove the rec.txt kept from an earlier run before copying the new one in
stale_rec_present = os.path.exists(file_rec_b)
if stale_rec_present:
    os.remove(file_rec_b)
shutil.copyfile(file_rec, file_rec_b)

# command line: <run.sh> <rec.txt> -a
os.system(' '.join([file_run_b, file_rec_b, '-a']))
# !!notice!! do not forget to change the dir under buildLM/bin and ./data, inside run.sh and .pl
print("------------------------------------------------>BuildLM Down!<------------------------------------------------ \n")
## Step3: LmAdaption
# Rotates the model files in the LmAdaption data folder and starts its run.sh:
#   * a final.arpa already present there becomes the new base.arpa,
#   * final.arpa from the buildLM data folder becomes the new mix1.arpa.
path_LmAdaption = './LM_shirley/env/LmAdaption/data'
file_final = path_buildLM + '/' + 'final.arpa'
file_run_c = path_LmAdaption + '/' + 'run.sh'
adapt_final = path_LmAdaption + '/' + 'final.arpa'
adapt_base = path_LmAdaption + '/' + 'base.arpa'
adapt_mix1 = path_LmAdaption + '/' + 'mix1.arpa'

if os.path.exists(adapt_final):  # only if final.arpa exists
    # Guard the removal: base.arpa may be absent, and an unconditional
    # os.remove would then raise and abort the whole pipeline.
    if os.path.exists(adapt_base):
        os.remove(adapt_base)  # delete old base
    os.rename(adapt_final, adapt_base)  # set new base

if os.path.exists(adapt_mix1):
    os.remove(adapt_mix1)  # delete old mix1
shutil.copyfile(file_final, adapt_final)  # copy final.arpa from /buildLM/data/
os.rename(adapt_final, adapt_mix1)  # set new mix1

os.system(file_run_c + ' -a')
# !!notice!! do not forget to change the dir under /bin and ./data, inside run.sh and .pl, and chmod +x run.sh
# you can change the interpolation parameters in file mix_model.txt: best lambda (0.5 0.5)
print("------------------------------------------------>LmAdaption Down!<------------------------------------------------ \n")
## Step4: BuildFSM_V1.3_linux
# Installs final.arpa from the LmAdaption data folder as NewSource/lm.arpa
# and then launches the RunBuildWFST_V1.31.sh script.
path_BuildFSM = './LM_shirley/env/BuildFSM_V1.3_linux'
path_NewSource = '/'.join((path_BuildFSM, 'NewSource'))
path_final_ = '/'.join((path_LmAdaption, 'final.arpa'))
path_RunBuild = '/'.join((path_BuildFSM, 'RunBuildWFST_V1.31.sh'))
lm_target = '/'.join((path_NewSource, 'lm.arpa'))

# clear the lm.arpa of a previous run, then put the new model in its place
previous_lm_present = os.path.exists(lm_target)
if previous_lm_present:
    os.remove(lm_target)
shutil.copyfile(path_final_, lm_target)

os.system(path_RunBuild + ' -a')
# !!notice!! change all relative dirs and libs!!! quite a lot
print("------------------------------------------------>BuildFSM_V1.3_linux Down!<------------------------------------------------ \n")
# step1_test_2.py
# (web page residue) Latest recommended article, published 2023-06-05 09:28:06