# 1. Vocabulary definition and token <-> index conversion
# 2. Fitting (training) the vocabulary
import numpy as np
class WordSequence(object):
    """Vocabulary that maps tokens to integer indices and back.

    Four reserved tokens always occupy indices 0-3 (padding, unknown,
    sequence start, sequence end).  Every other token is assigned an index
    by :meth:`fit`, which may be called only once per instance.
    """

    PAD_TAG = '<pad>'
    UNK_TAG = '<unk>'
    START_TAG = '<s>'
    END_TAG = '</s>'

    PAD = 0
    UNK = 1
    START = 2
    END = 3

    @staticmethod
    def _reserved_vocab():
        """Return a fresh dict mapping each reserved tag to its fixed index."""
        return {
            WordSequence.PAD_TAG: WordSequence.PAD,
            WordSequence.UNK_TAG: WordSequence.UNK,
            WordSequence.START_TAG: WordSequence.START,
            WordSequence.END_TAG: WordSequence.END,
        }

    def __init__(self):
        """Create an unfitted vocabulary containing only the reserved tags."""
        # Initialise the base dictionary with the reserved tags.
        self.dict = self._reserved_vocab()
        self.fitted = False

    def to_index(self, word):
        """Return the index of ``word``, or ``UNK`` if it is not in the vocabulary.

        Raises:
            AssertionError: if :meth:`fit` has not been called yet.
        """
        assert self.fitted, 'WordSequence尚未进行fit操作'
        return self.dict.get(word, WordSequence.UNK)

    def to_word(self, index):
        """Return the token stored at ``index``, or ``UNK_TAG`` if there is none.

        Raises:
            AssertionError: if :meth:`fit` has not been called yet.
        """
        assert self.fitted, 'WordSequence尚未进行fit操作'
        for token, token_index in self.dict.items():
            if token_index == index:
                return token
        # An unknown index maps to the unknown *token* (a string), mirroring
        # to_index, which maps an unknown token to the unknown *index*.
        return WordSequence.UNK_TAG

    def size(self):
        """Return the number of entries in the vocabulary plus one.

        Raises:
            AssertionError: if :meth:`fit` has not been called yet.
        """
        assert self.fitted, 'WordSequence尚未进行fit操作'
        # NOTE(review): this is one more than the largest index + 1.  Kept
        # unchanged because callers may size things from this value --
        # confirm whether the extra slot is intentional.
        return len(self.dict) + 1

    def __len__(self):
        """Return :meth:`size`, so ``len(ws)`` works on a fitted instance."""
        return self.size()

    def fit(self, sentences, min_count=5, max_count=None, max_features=None):
        """Build the vocabulary from an iterable of token sequences.

        Args:
            sentences: iterable of sentences; each sentence is itself an
                iterable of hashable tokens (a string is split into
                characters).
            min_count: drop tokens seen fewer than this many times
                (``None`` disables the lower bound).
            max_count: drop tokens seen more than this many times
                (``None`` disables the upper bound).
            max_features: if an int, keep only the ``max_features`` most
                frequent surviving tokens, indexed in ascending order of
                frequency.  Otherwise all surviving tokens are kept and
                indexed in sorted token order.

        Raises:
            AssertionError: if the instance has already been fitted.
            ValueError: if ``max_features`` is a negative int.
        """
        assert not self.fitted, 'WordSequence 只能fit一次'
        if isinstance(max_features, int) and max_features < 0:
            raise ValueError(
                'max_features must be non-negative, got %d' % max_features)

        frequencies = {}
        for sentence in sentences:
            for token in sentence:
                frequencies[token] = frequencies.get(token, 0) + 1

        vocab = self._reserved_vocab()
        # Corpus tokens that collide with a reserved tag are skipped: adding
        # them would overwrite the tag's fixed index and hand the same index
        # to the next token.
        kept = {
            token: freq
            for token, freq in frequencies.items()
            if token not in vocab
            and (min_count is None or freq >= min_count)
            and (max_count is None or freq <= max_count)
        }

        if isinstance(max_features, int):
            # Ascending by frequency; the sort is stable, so ties keep
            # first-seen order.
            ranked = sorted(kept, key=kept.get)
            # Slice from an explicit start: ranked[-0:] would keep everything.
            ordered = ranked[max(len(ranked) - max_features, 0):]
        else:
            ordered = sorted(kept)

        for token in ordered:
            vocab[token] = len(vocab)

        self.dict = vocab
        self.fitted = True