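# 1. Remove the single and double quotation marks from the txt file.
# 2. Each line holds one triple whose fields are separated by tab characters. Extract the first entity and the last entity of every triple; store the first entity of all triples in E:\xx\baike_chouqu_star_clean.txt and the last entity in E:\xx\baike_chouqu_end_clean.txt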
import datetime
import re
# Line counter consulted by the read loop further down; numbering starts at 1.
count = 1
# Capture the moment the run begins and echo it to the console.
start_time = datetime.datetime.now()
print("start time:", start_time)
# Read the triples file line by line (UTF-8) with two UTF-8 output files open
# for writing. Per the header note, `wt` is meant to receive the first entity
# of each triple and `wt1` the last one; the writes themselves are not visible
# in this excerpt.
# NOTE(review): the nesting indentation of these statements appears to have
# been lost; each `with`/`for`/`if` below is presumably nested inside the
# previous one -- confirm against the original file.
with open(r'E:\xx\baike_triples.txt', encoding='utf-8',mode='r')as f:
with open(r'E:\xx\baike_chouqu_star_clean.txt', encoding='utf-8', mode='w')as wt:
with open(r'E:\xx\baike_chouqu_end_clean.txt', encoding='utf-8', mode='w')as wt1:
for data in f:
# NOTE(review): `count` starts at 1 and is checked before the line is
# processed, so the loop stops on reaching the 100th line, i.e. 99 lines are
# handled rather than 100 -- confirm which was intended.
if count == 100: # test-only limit: the data set is large, so only about 100 lines are processed; this check can be commented out later
break
count += 1
# Each listed token (straight and curly double quotes, '</ a>', '<a>', the
# star symbol and the single quote) is replaced by one space, not deleted.
# NOTE(review): the pattern contains '</ a>' with a space, so a plain '</a>'
# would not match -- verify against the data.
result = re.sub('"|“|”|</ a>|<a>|★|\'',' ',data)# strip special characters