def build_dataset(words, vocab_size=None):
    """Encode a token sequence as integer ids over a frequency-ranked vocabulary.

    Args:
        words: Sequence of tokens (e.g. segmented words) making up the corpus.
        vocab_size: Total vocabulary size, including the 'UNK' slot. When
            omitted, the module-level ``vocabulary_size`` is used.

    Returns:
        A 4-tuple ``(data, count, dictionary, reverse_dictionary)``:

        * ``data`` -- ``words`` with every token replaced by its id; tokens
          outside the vocabulary become 0 (the 'UNK' id).
        * ``count`` -- ``[['UNK', n_unknown], (word, freq), ...]`` with the
          words in descending frequency order.
        * ``dictionary`` -- maps token -> id ('UNK' is 0, the most frequent
          word is 1, and so on).
        * ``reverse_dictionary`` -- maps id -> token.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    # 'UNK' takes the first slot; its frequency is filled in once the corpus
    # has been encoded.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    # Assign ids in frequency order.  setdefault keeps ids contiguous (and
    # 'UNK' pinned at 0) even if the corpus itself contains a literal 'UNK'
    # token; plain assignment would hand out a duplicate id in that case.
    dictionary = {}
    for word, _ in count:
        dictionary.setdefault(word, len(dictionary))

    # Out-of-vocabulary tokens fall back to id 0.
    data = [dictionary.get(word, 0) for word in words]

    # Record how many tokens ended up as 'UNK' (previously always left at 0).
    count[0][1] = data.count(0)

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary