核心问题:
h5py 写入 str 时不支持 unicode 编码,可以先将每个字符串编码为 UTF-8 字节串(bytes),再写入数据集:
sents1 = [tmp.encode('utf8') for tmp in sents1]
synts1 = [tmp.encode('utf8') for tmp in synts1]
代码:
def _read_and_parse(nlp, input_file):
    """Read *input_file* line by line and parse every line with *nlp*.

    Args:
        nlp: Parser object exposing ``parse(text)`` that returns a string.
        input_file: Path of a UTF-8 text file with one sentence per line.

    Returns:
        A ``(sents, synts)`` pair of equally long lists of UTF-8 encoded
        ``bytes``: the sentences with newline characters stripped from both
        ends, and the corresponding parses with every run of whitespace
        collapsed to a single space.
    """
    sents = []
    synts = []
    # The context manager closes the input file even if parsing raises.
    with open(input_file, "r", encoding="utf-8") as handle:
        for line in handle:
            # Encode to UTF-8 bytes up front: the datasets are written from
            # bytes rather than from unicode str objects.
            sents.append(line.strip('\n').encode('utf8'))
            # The raw line (including its trailing newline) is handed to the
            # parser; split/join then flattens the multi-line parse output
            # into a single line.
            flattened = " ".join(nlp.parse(line).split())
            synts.append(flattened.encode('utf8'))
    return sents, synts


def parse(input_file1, input_file2, output_file):
    """Parse two sentence files and store sentences and parses in an HDF5 file.

    Every line of each input file is parsed with Stanford CoreNLP. The output
    file receives four variable-length string datasets: ``sents1`` / ``synts1``
    for *input_file1* and ``sents2`` / ``synts2`` for *input_file2*.

    Both inputs are parsed before the output file is created, so a failure
    while reading or parsing does not leave a truncated or partially written
    HDF5 file behind.

    Args:
        input_file1: Path of the first UTF-8 text file, one sentence per line.
        input_file2: Path of the second UTF-8 text file, one sentence per line.
        output_file: Path of the HDF5 file to create (overwritten if present).

    Raises:
        OSError: If an input file cannot be opened or the output file cannot
            be created.
    """
    nlp = StanfordCoreNLP(r'D:\\stanford_nlp')
    try:
        parsed = [
            ("1", _read_and_parse(nlp, input_file1)),
            ("2", _read_and_parse(nlp, input_file2)),
        ]
        dtype = h5py.special_dtype(vlen=str)
        # ``with`` guarantees the HDF5 file is flushed and closed even when
        # creating a dataset fails.
        with h5py.File(output_file, 'w') as f:
            for suffix, (sents, synts) in parsed:
                f.create_dataset("sents" + suffix, dtype=dtype, data=sents)
                f.create_dataset("synts" + suffix, dtype=dtype, data=synts)
    finally:
        # NOTE(review): assumes the `stanfordcorenlp` wrapper, whose close()
        # shuts down the backend Java server it started; without this the
        # server process keeps running after the function returns.
        nlp.close()