朴素贝叶斯分类器是基于规则库的传统统计模型。需要说明的是,下文代码实际实现的是基于词典的前向/后向最长匹配分词,词典即充当规则库。
一,首先应导入词典作为规则库使用
# Connect to the Access (.mdb) lexicon database.
# Raw string: '\L' is not a valid escape sequence, so the non-raw literal only
# worked by accident and triggers a SyntaxWarning on recent Python versions.
p_path = r'D:\Lexicon_full_2000.mdb'
# NOTE(review): the database password is hard-coded in the connection string.
connStr = 'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+p_path+';PWD=007'
conn = mdb.win_connect_mdb(connStr)
try:
    # Create a cursor and fetch every row of the `words` table.
    cur = conn.cursor()
    cur.execute('SELECT * FROM words;')
    allword = cur.fetchall()
finally:
    # Release the connection even when the query fails.
    conn.close()
# Label the three fetched columns and keep only the 'ciyu' column as a plain
# list (presumably the word strings themselves - confirm against the table).
allword = pd.DataFrame(allword, columns=['词语序列', 'ciyu', '出现次数'])
allword = allword['ciyu'].tolist()
二,建立朴素贝叶斯分类器
# Backward (right-to-left) longest-match segmenter.
def back_longest(text, dict, max_len=9):
    """Segment *text* by scanning from its end and taking the longest match.

    At each step the longest slice ending at the current position that is
    present in *dict* is taken as the next word; if no multi-character slice
    matches, the single character at that position is emitted on its own.

    Args:
        text: The string to segment.
        dict: Iterable of known words (hashable entries). The parameter name
            shadows the builtin but is kept so existing callers still work.
        max_len: Longest candidate word, in characters, that is looked up.
            The default of 9 is the window the original code used. Values
            below 2 make every character its own token.

    Returns:
        The words in their original left-to-right order. Joining them
        reproduces *text* exactly; an empty *text* gives an empty list.
    """
    # Hash-based membership instead of scanning a list for every candidate.
    vocab = dict if isinstance(dict, (set, frozenset)) else frozenset(dict)

    words = []
    end = len(text)  # exclusive end of the part that is not segmented yet
    # `end > 0` (not `end - 1 > 0`) so the very first character is kept too.
    while end > 0:
        lowest_start = max(0, end - max_len)
        match = text[end - 1]  # fallback: a single character
        for start in range(lowest_start, end - 1):
            candidate = text[start:end]
            if candidate in vocab:
                match = candidate
                # Candidates run from longest to shortest, so the first hit
                # is the longest one; continuing would overwrite it with a
                # shorter word.
                break
        words.append(match)
        end -= len(match)
    # Words were collected right-to-left; restore reading order once.
    words.reverse()
    return words
# Forward (left-to-right) longest-match segmenter.
def forward_longest(text, dict, max_len=8):
    """Segment *text* by scanning from its start and taking the longest match.

    At each step the longest slice starting at the current position that is
    present in *dict* is taken as the next word; if no multi-character slice
    matches, the single character at that position is emitted on its own.

    Args:
        text: The string to segment.
        dict: Iterable of known words (hashable entries). The parameter name
            shadows the builtin but is kept so existing callers still work.
        max_len: Longest candidate word, in characters, that is looked up.
            The default of 8 is the window the original code used. Values
            below 2 make every character its own token.

    Returns:
        The words in left-to-right order. Joining them reproduces *text*
        exactly, including its final character; an empty *text* gives an
        empty list.
    """
    # Hash-based membership instead of scanning a list for every candidate.
    vocab = dict if isinstance(dict, (set, frozenset)) else frozenset(dict)

    words = []
    total = len(text)
    start = 0
    while start < total:
        # Inclusive of `total`, so a candidate may reach the end of the text.
        limit = min(total, start + max_len)
        match = text[start]  # fallback: a single character
        # Try the longest candidate first and stop at the first hit.
        for end in range(limit, start + 1, -1):
            candidate = text[start:end]
            if candidate in vocab:
                match = candidate
                break
        words.append(match)
        start += len(match)
    return words
三,导入需要分词的文本
# Load the UTF-8 encoded text that is going to be segmented.
with open('D:/fenci.txt', encoding="utf-8") as f:
    text = f.read()
四,调用分类器函数,开始分词
# Print a label followed by the segmentation result, backward matcher first,
# then the forward matcher.
for _label, _segment in (('后向', back_longest), ('前向', forward_longest)):
    print(_label)
    print(_segment(text, allword))
最后我们能够看出后向算法总体上效果要好于前向算法,但是分词错误仍然较多。我们可以在算法上做一些改进,添加一些算法因子将前向和后向算法混合起来形成混合算法,这样效果会更好一些,在此不做赘述。
下一篇将讲解在分词方面效果相对更好的隐马尔可夫模型。
完整代码:
import pypyodbc as mdb
import pandas as pd
import time
# Connect to the Access (.mdb) lexicon database.
# Raw string: '\L' is not a valid escape sequence, so the non-raw literal only
# worked by accident and triggers a SyntaxWarning on recent Python versions.
p_path = r'D:\Lexicon_full_2000.mdb'
# NOTE(review): the database password is hard-coded in the connection string.
connStr = 'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+p_path+';PWD=007'
conn = mdb.win_connect_mdb(connStr)
try:
    # Create a cursor and fetch every row of the `words` table.
    cur = conn.cursor()
    cur.execute('SELECT * FROM words;')
    allword = cur.fetchall()
finally:
    # Release the connection even when the query fails.
    conn.close()
# Label the three fetched columns and keep only the 'ciyu' column as a plain
# list (presumably the word strings themselves - confirm against the table).
allword = pd.DataFrame(allword, columns=['词语序列', 'ciyu', '出现次数'])
allword = allword['ciyu'].tolist()
# Load the UTF-8 encoded text that is going to be segmented.
with open('D:/fenci.txt', encoding="utf-8") as f:
    text = f.read()
def back_longest(text, dict, max_len=9):
    """Segment *text* by scanning from its end and taking the longest match.

    At each step the longest slice ending at the current position that is
    present in *dict* is taken as the next word; if no multi-character slice
    matches, the single character at that position is emitted on its own.

    Args:
        text: The string to segment.
        dict: Iterable of known words (hashable entries). The parameter name
            shadows the builtin but is kept so existing callers still work.
        max_len: Longest candidate word, in characters, that is looked up.
            The default of 9 is the window the original code used. Values
            below 2 make every character its own token.

    Returns:
        The words in their original left-to-right order. Joining them
        reproduces *text* exactly; an empty *text* gives an empty list.
    """
    # Hash-based membership instead of scanning a list for every candidate.
    vocab = dict if isinstance(dict, (set, frozenset)) else frozenset(dict)

    words = []
    end = len(text)  # exclusive end of the part that is not segmented yet
    # `end > 0` (not `end - 1 > 0`) so the very first character is kept too.
    while end > 0:
        lowest_start = max(0, end - max_len)
        match = text[end - 1]  # fallback: a single character
        for start in range(lowest_start, end - 1):
            candidate = text[start:end]
            if candidate in vocab:
                match = candidate
                # Candidates run from longest to shortest, so the first hit
                # is the longest one; continuing would overwrite it with a
                # shorter word.
                break
        words.append(match)
        end -= len(match)
    # Words were collected right-to-left; restore reading order once.
    words.reverse()
    return words
def forward_longest(text, dict, max_len=8):
    """Segment *text* by scanning from its start and taking the longest match.

    At each step the longest slice starting at the current position that is
    present in *dict* is taken as the next word; if no multi-character slice
    matches, the single character at that position is emitted on its own.

    Args:
        text: The string to segment.
        dict: Iterable of known words (hashable entries). The parameter name
            shadows the builtin but is kept so existing callers still work.
        max_len: Longest candidate word, in characters, that is looked up.
            The default of 8 is the window the original code used. Values
            below 2 make every character its own token.

    Returns:
        The words in left-to-right order. Joining them reproduces *text*
        exactly, including its final character; an empty *text* gives an
        empty list.
    """
    # Hash-based membership instead of scanning a list for every candidate.
    vocab = dict if isinstance(dict, (set, frozenset)) else frozenset(dict)

    words = []
    total = len(text)
    start = 0
    while start < total:
        # Inclusive of `total`, so a candidate may reach the end of the text.
        limit = min(total, start + max_len)
        match = text[start]  # fallback: a single character
        # Try the longest candidate first and stop at the first hit.
        for end in range(limit, start + 1, -1):
            candidate = text[start:end]
            if candidate in vocab:
                match = candidate
                break
        words.append(match)
        start += len(match)
    return words
# Print a label followed by the segmentation result, backward matcher first,
# then the forward matcher.
for _label, _segment in (('后向', back_longest), ('前向', forward_longest)):
    print(_label)
    print(_segment(text, allword))