# Load the training corpus. Each non-blank line is "<label><whitespace><content>".
with open('./cnews/cnews.train.txt', encoding='utf8') as file:
    # Strip surrounding whitespace and drop blank lines; a blank line would
    # otherwise raise IndexError when its label is extracted below.
    label_list = [line for line in (raw.strip() for raw in file) if line]

# The first whitespace-separated field of each line is the label.
train_label_list = [line.split()[0] for line in label_list]
# Everything after the first run of whitespace is the content.
# NOTE: a line holding a label but no content still raises IndexError here.
train_content_list = [line.split(maxsplit=1)[1] for line in label_list]

# Load the vocabulary, one entry per line.
with open('./cnews/cnews.vocab.txt', encoding='utf8') as file:
    vocabulary_list = [line.strip() for line in file]

# Map each vocabulary entry to its line index (key: entry, value: integer id).
# If an entry appears more than once, the last occurrence wins.
word2id_dict = {word: idx for idx, word in enumerate(vocabulary_list)}


def content2idList(content):
    """Convert a piece of content into a list of vocabulary ids.

    Iterates over ``content`` item by item (character by character when it
    is a string) and looks each item up in ``word2id_dict``. Items that are
    not in the vocabulary are skipped.

    Returns:
        A list of integer ids, in the order the items appear in ``content``.
    """
    return [word2id_dict[word] for word in content if word in word2id_dict]


# One list of ids per training line.
train_idlist_list = [content2idList(content) for content in train_content_list]

# Sort the distinct labels before numbering them: iterating a set of strings
# directly yields an order that can change between interpreter runs (string
# hash randomization), which would make the label ids non-reproducible.
label2id_dict = {
    label: idx for idx, label in enumerate(sorted(set(train_label_list)))
}
train_labelid_list = [label2id_dict[label] for label in train_label_list]
# Data processing: convert the text to numeric ids, vectorize it, and build the dictionaries.