from keras.preprocessing import text
# Real data loading is disabled below; a two-sentence toy corpus is used.
# facts, accu_label, article_label, imprison_label = load_data()
somestr = [
    'ha ha gua angry',
    'howa ha gua excited naive',
]

# Create the tokenizer and let it learn the vocabulary of the corpus.
tok = text.Tokenizer()
tok.fit_on_texts(somestr)

# word_index is the dict that maps each word to its integer index.
word_index = tok.word_index
print(word_index)

# Replace every word of every string with its integer index.
sequences = tok.texts_to_sequences(somestr)
print(sequences)
{'naive': 6, 'ha': 1, 'excited': 5, 'angry': 3, 'gua': 2, 'howa': 4}
[[1, 1, 2, 3], [4, 1, 2, 5, 6]]
转换成定长的整数索引序列(长度不足 maxlen 的在前面补 0)
from keras.preprocessing import sequence

# Fixed length that every text is brought to.
maxlen = 10

# Give every index sequence the same length (maxlen), stored as int16.
x = sequence.pad_sequences(sequences, maxlen=maxlen, dtype='int16')
print(x)
[[0 0 0 0 0 0 1 1 2 3]
[0 0 0 0 0 4 1 2 5 6]]
import numpy as np

# Number of padded samples available.
lenofdata = len(x)

# Keep the first 80% of the rows (rounded down) as the training split.
# The original indexed with x[np.arange(len(x))] first: an identity
# fancy-index that copies the whole array without reordering anything.
# A plain slice selects the same rows; .copy() keeps x_train independent
# of x, as it was before.
x_train = x[:int(lenofdata * 0.8)].copy()
print(x_train)
[[0 0 0 0 0 0 1 1 2 3]]
# Stack the full matrix and the training split row-wise. The result is
# not bound to a name, so it is only displayed in an interactive session.
np.vstack([x, x_train])
array([[0, 0, 0, 0, 0, 0, 1, 1, 2, 3],
[