# Data preprocessing for the cnews dataset.
import jieba
import pandas as pd
import tensorflow as tf
from collections import Counter
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
# Read the stop-word list.
def read_stopword(filename, encoding=None):
    """Load stop words from a text file, one entry per line.

    Args:
        filename: Path of the stop-word file.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, so existing callers are
            unaffected; pass e.g. ``'utf-8'`` to read the file the same way
            on every platform.

    Returns:
        A list with one string per line of the file, in file order, with
        line-feed characters removed. Blank lines yield empty strings.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'r', encoding=encoding) as fp:
        return [line.replace('\n', '') for line in fp]
# Tokenize the documents and drop stop words.
def cut_data(data, stopword):
    """Tokenize every entry of ``data['content']`` with jieba and remove stop words.

    Args:
        data: Object supporting ``data['content']`` lookup and assignment
            (a pandas DataFrame in this script) whose 'content' entry
            iterates over raw text strings.
        stopword: Iterable of tokens to discard.

    Returns:
        The same ``data`` object. Its 'content' entry is replaced in place by
        a list of token lists, one per document, with the original token
        order preserved.
    """
    # Build the lookup set once; membership tests are then O(1) per token
    # instead of rebuilding the set and rescanning the list for each document.
    stopword_set = set(stopword)
    data['content'] = [
        [token for token in jieba.cut(content) if token not in stopword_set]
        for content in data['content']
    ]
    return data
# Collect every token into one flat list.
def word_list(data):
    """Flatten the per-document token sequences in ``data['content']``.

    Args:
        data: Object supporting ``data['content']`` lookup; that entry
            iterates over iterables of tokens.

    Returns:
        A new list containing every token of every document, in order.
    """
    return [token for tokens in data['content'] for token in tokens]
# Extract features.
def feature(train_data, test_data, val_data):
    """Train Word2Vec on all three splits and look up vectors for each document.

    Args:
        train_data: Object whose ``['content']`` entry is a pandas Series of
            token lists (as produced by ``cut_data``).
        test_data: Same structure as ``train_data``.
        val_data: Same structure as ``train_data``.

    Returns:
        A ``(train_fea, test_fea, val_fea)`` tuple. Each item is the result of
        applying the word-vector lookup to every document of the matching
        split; presumably one (len(tokens), 100) array per document -- confirm
        against the installed gensim version.
    """
    # The embedding model is trained on the union of all three splits.
    content = pd.concat(
        [train_data['content'], test_data['content'], val_data['content']],
        ignore_index=True,
    )
    # Disabled alternative: bag-of-words features via CountVectorizer.
    # count_vec = CountVectorizer(max_features=300, min_df=2)
    # count_vec.fit_transform(content)
    # train_fea = count_vec.transform(train_data['content']).toarray()
    # test_fea = count_vec.transform(test_data['content']).toarray()
    # val_fea = count_vec.transform(val_data['content']).toarray()

    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names
    # (renamed `vector_size` / `epochs` in gensim 4.0) -- confirm the pinned
    # gensim version before upgrading.
    model = Word2Vec(content, size=100, min_count=1, window=10, iter=10)

    # Index the KeyedVectors (`model.wv`) rather than the model itself:
    # `model[...]` is deprecated in gensim 3.x and removed in 4.0.
    vectors = model.wv

    def embed(tokens):
        """Return the stacked word vectors for one token list."""
        return vectors[tokens]

    train_fea = train_data['content'].apply(embed)
    test_fea = test_data['content'].apply(embed)
    val_fea = val_data['content'].apply(embed)
    return train_fea, test_fea, val_fea
if __name__ == '__main__':
train_data = pd.read_csv('./data/task1/cnews/cnews.train.txt', names=['title', 'content'], sep='\t') # (50000, 2)
test_data = pd.read_csv('./data/task1/cnews/cnews.test.txt', names=['title', 'content'], sep='\t') # (10000, 2)
val_data = pd.read_csv('./data/task1/cnews/cnews.val.txt', names=['title', 'content'], sep='\t') # (5000, 2)
train_data = train_data.head(50)
test_data = test_data.head(50)
val_data = val_data.head(50)
stopword = read_stopword('./data/stopword.txt&#