1.在 Linux 上安装 CRF++
1)比较常用的工具就是 CRF++。官网地址为:https://taku910.github.io/crfpp/ ,下载时选择 CRF++-0.58.tar.gz
2)tar zxvf CRF++-0.58.tar.gz,然后进入 CRF++-0.58 目录下
3) ./configure
4)make
5)su
6)make install
注意:这里用 CRF++ 工具跑数据时,都是在安装包所在目录下运行的。在其他目录下也可以通过完整路径调用,例如:
/home/yu/crf/CRF++-0.58/crf_test -m model test.data >> tr.txt
以上测试后没有问题。
安装是否成功,测试一下:
1)用 CRF++-0.58 包里面的模板跑出模型 model
- 用跑出来的模型 model 去跑测试数据 test.data ,>> test.txt 意思是跑出的测试数据写入到 test.txt
- 查看跑出的数据test.txt 文件
2.关于训练中文数据遇到的问题
1.遇到用cat 打开文件显示乱码,首先把文件转换成utf-8 的无 BOM 的格式代码。
2.用 model 跑测试数据时,出现过只处理了最后一段话、前面的内容都丢失的情况。具体原因还不太清楚,但换了一个测试文件后就正常了。
3.当要写入一个以当前时间为名的txt 文档,可以字符串之间组合,不用犹豫不决。代码如下:
# Build a file name from the current timestamp by plain string concatenation
# (Python 2 syntax; assumes `from datetime import datetime` is in scope).
result_file = datetime.now().strftime("%Y%m%d%H%M%S") + '_' +'.txt'
print result_file
输出:
- 20170528211827_.txt
3.代码:
#-*- coding:utf-8 -*-
import os
import random
import shlex
import string
import subprocess
import sys
from datetime import datetime, timedelta
# Python 2-only idiom: re-import sys, then set the process-wide default
# string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
# Maps the tag found in the second output column to the label used in the
# {{label:entity}} markers written to the result file.
_ENTITY_LABELS = {
    't': 'time',
    'ns': 'location',
    'nr': 'person_name',
    'nt': 'org_name',
}


def _annotate_entities(rows):
    """Turn tab-separated crf_test output rows into annotated running text.

    Each usable row has at least three columns: token, entity tag, position
    marker.  A token whose marker is 'S', or whose tag is not in
    _ENTITY_LABELS, is copied through unchanged.  Otherwise tokens are
    buffered per tag while the marker is 'B' or 'M', and any other marker
    closes the entity, emitting '{{label:joined tokens}}'.

    Blank rows and rows with fewer than three columns are skipped.
    NOTE: an entity that is never closed is dropped, as in the original code.
    """
    pending = dict((tag, []) for tag in _ENTITY_LABELS)
    pieces = []
    for row in rows:
        row = row.strip()
        if not row:
            continue
        cols = row.split('\t')
        if len(cols) < 3:
            # Not a token row (stderr is merged into stdout, so diagnostics
            # can show up here); indexing it used to raise IndexError.
            continue
        token, tag, position = cols[0], cols[1], cols[2]
        if position == 'S' or tag not in _ENTITY_LABELS:
            pieces.append(token)
            continue
        pending[tag].append(token)
        if position not in ('B', 'M'):
            pieces.append('{{' + _ENTITY_LABELS[tag] + ':' + ''.join(pending[tag]) + '}}')
            pending[tag] = []
    return ''.join(pieces)


def runLinuxCommandOnPY(path_crf_test, path_model, path_test, path_output, path_result):
    """Run crf_test on a data file and write the entity-annotated text.

    Executes ``<path_crf_test> -m <path_model> <path_test>``, echoes the
    output to stdout, and, if the command exits with status 0, converts the
    output rows with _annotate_entities() and writes the text to a new file.

    Args:
        path_crf_test: Command used to start crf_test.  It is split with
            shlex, so it may carry extra flags or quoted path components.
        path_model: Path of the model file passed after ``-m``.
        path_test: Path of the data file to label.
        path_output: Prefix for the temporary copy of the raw output
            (prefix + 4 random letters + '.txt').  The file is always
            removed before returning.
        path_result: Prefix for the result file
            (prefix + timestamp + '_' + 2 random letters + '.txt').

    Returns:
        The result file path.  The file only exists when the command ran
        and exited with status 0; the path is returned either way.
    """
    letters = list(string.ascii_letters)
    random.shuffle(letters)
    path_write_test = path_output + ''.join(letters[:4]) + '.txt'

    # Argument list instead of a shell string: the model and data paths are
    # passed verbatim, so spaces or shell metacharacters cannot alter the command.
    command = shlex.split(path_crf_test) + ['-m', path_model, path_test]

    rows = []
    succeeded = False
    try:
        p = subprocess.Popen(command, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             universal_newlines=True)  # text rows on Python 2 and 3
    except OSError as err:
        # Under the old shell=True call a missing executable only produced a
        # shell message and a non-zero status; keep that non-raising contract.
        sys.stdout.write('%s\n' % err)
    else:
        try:
            with open(path_write_test, 'w') as f_trained:
                for line in p.stdout:
                    row = line.strip()
                    f_trained.write(row + '\n')
                    sys.stdout.write(line)
                    rows.append(row)
        finally:
            p.stdout.close()
            succeeded = p.wait() == 0
            # Remove the temporary copy without spawning `rm -rf` in a shell.
            if os.path.exists(path_write_test):
                os.remove(path_write_test)

    result_file = (datetime.now().strftime("%Y%m%d%H%M%S") + '_'
                   + ''.join(letters[:2]) + '.txt')
    result_path = path_result + result_file
    if succeeded:
        with open(result_path, 'w') as f:
            f.write(_annotate_entities(rows))
    return result_path
调用:
#-*- coding:utf-8 -*-
import sys
import executeLinuxCommand
# Python 2-only idiom: re-import sys, then set the process-wide default
# string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
def test():
    """Call runLinuxCommandOnPY with the sample paths and print its return value."""
    # Order matches the callee's positional parameters:
    # crf_test command, model, test data, output prefix, result prefix.
    sample_paths = (
        '../CRF++-0.58/crf_test',
        '../entityRecognition/ceShi/yu/model',
        '../entityRecognition/ceShi/yu/gtest.data',
        '../entityRecognition/ceShi/yu/over_file/',
        '../entityRecognition/ceShi/yu/result/',
    )
    print(executeLinuxCommand.runLinuxCommandOnPY(*sample_paths))


if __name__ == '__main__':
    test()
安装过程: