1.逆向最大匹配(IMM):从句尾向句首顺序遍历,每一步找出词表中能匹配的最长的词,然后依次向前递推;以词的长度作为是否切分的评判标准
应用:分词,短语匹配
# coding=utf-8
import numpy as np
import pandas as pd
class IMM(object):
def __init__(self,vocab_txtpath):
self._vocabs=[]
with open(vocab_txtpath,'r') as fr:
for line in fr:
word_iter=line.strip('\r\n ')
self._vocabs.append(word_iter)
self._maximum=max([len(word_iter) for word_iter in self._vocabs])
def cut(self,sentence):
words_cur=[]
length=len(sentence)
index=length-1
while index>=0:
word = None
for i in range(self._maximum,0,-1):
word_piece_iter=sentence[index-i:index+1]
if word_piece_iter in self._vocabs:
word=word_piece_iter
words_cur.append(word_piece_iter)
index-=len(wor