# 5-2. 利用汉语切分和标注语料(注意版权的合法性),尝试用bi-gram 实现一个简单的汉语自动分词程序。
#-------------------------------------------------------------------------------
# Name: n_gram切分中文
# Purpose: 自然语言处理第5章作业
# 水平有限,仅做参考
# Author: nkenen
#
# Created: 22/02/2020
# Copyright: (c) Administrator 2020
# Licence: <your licence>
#-------------------------------------------------------------------------------
import re
# Punctuation marks, decimal digits and ASCII letters, grouped by class.
# NOTE(review): presumably the set of characters treated as non-Chinese by
# the segmenter; its use is outside this part of the file - confirm.
symbol = (
    ',.!?。,?!'                      # punctuation
    '0123456789'                    # digits
    'qwertyuiopasdfghjklzxcvbnm'    # lowercase letters (keyboard order)
    'QWERTYUIOPASDFGHJKLZXCVBNM'    # uppercase letters (keyboard order)
)
# Not a generally useful routine: it only converts the already-tagged 1998
# corpus into an untagged one.
def _strip_tag_runs(line):
    """Return the characters of *line* that the corpus converter keeps.

    Scanning left to right:

    * a ``/`` and everything up to and including the next space is dropped;
      if no space follows, the rest of the line is dropped;
    * a ``]`` outside such a run is kept and switches copying on;
    * any other character is kept only when copying is already on.

    NOTE(review): copying starts only after a ``]`` is seen outside a tag
    run, so a line without one yields an empty string. This mirrors the
    original logic exactly - confirm it matches the corpus format in use.
    """
    kept = []
    copying = False
    pos = 0
    end = len(line)
    while pos < end:
        ch = line[pos]
        if ch == '/':
            # Skip the whole "/tag " run, including the space that ends it.
            space_at = line.find(' ', pos)
            if space_at == -1:
                break  # the tag runs to the end of the line
            pos = space_at + 1
            continue
        if ch == ']':
            copying = True
        if copying:
            kept.append(ch)
        pos += 1
    return ''.join(kept)


def Makenomarkedcorpus(source_path='F:/自然语言处理/199801_people_s_daily.txt',
                       target_path='F:/自然语言处理/1980pd.txt'):
    """Write a tag-stripped copy of the tagged corpus, one output line per input line.

    Args:
        source_path: Tagged corpus to read (UTF-8; undecodable bytes are
            ignored). Defaults to the original hard-coded location.
        target_path: File to create or overwrite with the converted text
            (UTF-8). Defaults to the original hard-coded location.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Open the input first so a missing corpus does not truncate an existing
    # output file; the with-statement closes both handles even on error.
    with open(source_path, 'r', encoding='utf-8', errors='ignore') as tagged, \
            open(target_path, 'w', encoding='utf-8') as plain:
        for line in tagged:
            plain.write(_strip_tag_runs(line) + '\n')