# The dataset is fairly large, so only an example is shown here.
import codecs
import re
import json
# Read train_pub.json as raw bytes and parse it into `datatrain`.
with open('train_pub.json', 'rb') as train_file:
    datatrain = json.loads(train_file.read())
# 数据预处理
# 预处理名字
def precessname(name):
    """Normalize a name string into a lowercase, underscore-separated form.

    The name is lowercased, every space and period is turned into an
    underscore, hyphens are removed, and any run of two or more
    underscores is collapsed into a single underscore.

    Args:
        name: The raw name string.

    Returns:
        The normalized name string.
    """
    normalized = name.lower()
    # Spaces and periods are both treated as word separators.
    for separator in (' ', '.'):
        normalized = normalized.replace(separator, '_')
    # Hyphens are dropped outright, so hyphenated parts are joined together.
    normalized = normalized.replace('-', '')
    # "j. smith" yields "j__smith" above; squeeze repeated underscores.
    return re.sub(r"_{2,}", "_", normalized)
# Preprocess the organization string: expand common abbreviations.
def preprocessorg(org):
if org != "":
org = org.replace('Sch.', 'School')
org = org.replace('Dept.', 'Department')
org = org.replace('Coll.', 'College')
org = org.replace('Inst.', 'Institute')
org = org.replace('Univ.', 'University')
org = org.replace('Lab ', 'Laboratory ')
org = org.replace('Lab.', 'Laboratory'