from collections import Counter
def getVocabularyText(content_list, size):
    """Build a character vocabulary of at most `size` entries and write it to disk.

    The vocabulary consists of the special '<PAD>' token followed by the
    `size - 1` most frequent characters across all texts, written one token
    per line to 'vocabulary.txt' (UTF-8).

    Args:
        content_list: iterable of strings; all texts are concatenated and
            counted character by character.
        size: total vocabulary size including the '<PAD>' token.
    """
    size = size - 1  # reserve one slot for '<PAD>'
    # Merge every article into a single string so Counter sees all characters.
    allContent = ''.join(content_list)
    counter = Counter(allContent)
    vocabulary = ['<PAD>']
    for char, _count in counter.most_common(size):
        vocabulary.append(char)
    # BUGFIX: the original wrote an undefined name `vocab`; write each
    # vocabulary token on its own line instead.
    with open('vocabulary.txt', 'w', encoding='utf8') as file:
        for vocab in vocabulary:
            file.write(vocab + '\n')
# 2. Load the data
# Load the vocabulary: one token per line.
with open('cnews.vocab.txt', encoding='utf8') as file:
    vocabulary_list = [k.strip() for k in file.readlines()]
# Load the training set. Each line is "<label> <content>", so splitting
# with maxsplit=1 separates the label from the article text.
with open('cnews.train.txt', encoding='utf8') as file:
    line_list = [k.strip() for k in file.readlines()]
train_label_list = [k.split()[0] for k in line_list]
train_content_list = [k.split(maxsplit=1)[1] for k in line_list]
# Load the test set in the same "<label> <content>" format.
with open('cnews.test.txt', encoding='utf8') as file:
    line_list = [k.strip() for k in file.readlines()]
test_label_list = [k.split()[0] for k in line_list]
test_content_list = [k.split(maxsplit=1)[1] for k in line_list]
import tensorflow.contrib.keras as kr

# Map each vocabulary token to its integer id (position in the vocab file).
word2id_dict = dict((b, a) for a, b in enumerate(vocabulary_list))


def content2vector(content_list):
    """Convert each text into a list of token ids.

    Characters missing from the vocabulary fall back to the id of '<PAD>'.

    Args:
        content_list: iterable of strings (one string per article).

    Returns:
        A list of lists of ints, one inner list per input string.
    """
    content_vector_list = []
    for content in content_list:
        content_vector = []
        for word in content:
            if word in word2id_dict:
                content_vector.append(word2id_dict[word])
            else:
                content_vector.append(word2id_dict['<PAD>'])
        content_vector_list.append(content_vector)
    return content_vector_list


train_vector_list = content2vector(train_content_list)
test_vector_list = content2vector(test_content_list)
print(len(train_content_list[0]))
print(len(train_vector_list[:1][0]))
print('************************************')
print(len(test_content_list[0]))
print(len(test_vector_list[:1][0]))

# BUGFIX: in the original, pad_sequences was called BEFORE train_vector_list /
# test_vector_list were defined, raising NameError. Padding is now done after
# vectorization. Every sequence is padded/truncated to a fixed length of 600.
train_X = kr.preprocessing.sequence.pad_sequences(train_vector_list, 600)
test_X = kr.preprocessing.sequence.pad_sequences(test_vector_list, 600)