主要介绍内容:
1. 文本分析:可视化
2. 文本关键词提取
3. 机器学习实现文本分类
4. FastText实现文本分类
5. 深度学习实现文本分类
一、文本可视化——WordCloud库
可以实现自定义图片形状的词云
'''
# 主要代码
'''
# Count how often each segmented word occurs. `size()` + `reset_index(name=...)`
# replaces the dict-renaming form `agg({'计数': np.size})`, which pandas
# deprecated and later removed (it raises SpecificationError on current
# releases). The result has the same two columns: 'segment' and '计数'.
word_stat = (
    word_df.groupby(by=['segment'])['segment']
    .size()
    .reset_index(name='计数')
)
word_stat = word_stat.sort_values(by=['计数'], ascending=False)

# The same image is used as the cloud's mask and, below, as its color source.
bimg = imread('image/timg.jpg')
wordcloud = WordCloud(
    font_path='data/simhei.ttf',
    mask=bimg,
    background_color='white',
    max_font_size=200)

# Map each of the 100 most frequent words to its count.
word_frequence = {x[0]: x[1] for x in word_stat.head(100).values}
wordcloud = wordcloud.fit_words(word_frequence)

# Recolor the words from the mask image before drawing, then hide the axes.
bimgColors = ImageColorGenerator(bimg)
plt.imshow(wordcloud.recolor(color_func=bimgColors))
plt.axis('off')
plt.show()
实现效果:
二、关键词提取
1. TF-IDF实现关键词提取
# Extract the 30 top-ranked keywords of `contents` (weights omitted, empty
# part-of-speech filter), then join them into one space-separated string.
x = analyse.extract_tags(contents, topK=30, withWeight=False, allowPOS=())
x = " ".join(x)
2. TextRank实现关键词提取
# Extract the 30 top-ranked keywords of `contents` with TextRank, restricted to
# the part-of-speech tags 'v' and 'n' (presumably verbs and nouns -- confirm
# against the tagger's tag set), then join them into one space-separated string.
x = analyse.textrank(contents, topK=30, withWeight=False, allowPOS=('v', 'n'))
x = " ".join(x)
3. LDA实现关键词提取
# Build the vocabulary from the documents in `segs`, then convert every
# document to its bag-of-words form via the dictionary.
dictionary = corpora.Dictionary(segs)
corpus = list(map(dictionary.doc2bow, segs))

# Fit a 20-topic LDA model over the corpus.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
)

# For each of the 20 topics (8 words per topic), print element 1 of the
# returned entry.
for topic in lda.print_topics(num_topics=20, num_words=8):
    print(topic[1])
三、机器学习实现文本分类
机器学习方法:朴素贝叶斯、SVM
# Split the samples `x` and labels `y` into training and test subsets; the
# fixed random_state keeps the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=1234)
class TextClassifier():
    """Text classifier that pairs a word n-gram CountVectorizer with an estimator.

    Texts are vectorized into counts of word 1- to 4-grams (at most 20000
    features) and handed to the wrapped estimator.
    """

    def __init__(self, classifier=None):
        """Create the vectorizer and store the estimator.

        Args:
            classifier: Estimator providing ``fit``, ``predict`` and ``score``.
                When omitted, a new ``MultinomialNB`` is created for this
                instance.
        """
        # The default is built here rather than in the signature: a default
        # written as ``classifier=MultinomialNB()`` is evaluated once and would
        # be shared (including its fitted state) by every instance.
        self.classifier = MultinomialNB() if classifier is None else classifier
        # To use an SVM instead, pass e.g. SVC(kernel='linear') as `classifier`.
        self.vectorizer = CountVectorizer(
            analyzer='word',
            ngram_range=(1, 4),
            max_features=20000)

    def features(self, X):
        """Return the vectorizer's transform of the texts in ``X``."""
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        """Fit the vectorizer on ``X``, then the estimator on the features and ``y``."""
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        """Predict for a single text ``x``.

        The text is wrapped in a one-element list before vectorizing, so the
        result is whatever the estimator returns for a one-sample batch.
        """
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        """Return the estimator's ``score`` for texts ``X`` against labels ``y``."""
        return self.classifier.score(self.features(X), y)
# Train a classifier with the default estimator on the training split.
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
# Classify one sample sentence; it is passed as space-separated words.
predict_class = text_classifier.predict('这 是 有史以来 最 大 的 一 次 军舰 演习')
print(predict_class)
# Evaluate on the held-out test split and report the score.
print_score = text_classifier.score(x_test,y_test)
print("score:",print_score)
四、FastText实现文本分类
FastText的实验代码很简单,但数据预处理后的格式比较特殊:每行以标签开头,后面接分词后的文本,即"__label__类别, 分好词的文本",其中前缀__label__可以自定义(需与训练和加载模型时的label_prefix保持一致)
预处理代码如下:
def preprocess_text(content, stopword, segment, category):
    """Segment each line and append it to ``segment`` as a labelled row.

    Every line is cut with ``jieba.lcut``; tokens of length 1 and tokens found
    in ``stopword`` are dropped. The remaining tokens are joined with spaces
    and appended as ``"__label__<category>, <tokens>"``.

    Args:
        content: Iterable of text lines.
        stopword: Container of tokens to discard (only ``in`` is used on it;
            a set keeps each lookup cheap).
        segment: List that receives one formatted row per processed line; it
            is modified in place.
        category: Label for all lines; rendered with ``str()``.

    Returns:
        None. A line whose segmentation raises is printed and skipped.
    """
    for line in content:
        try:
            tokens = [
                tok for tok in jieba.lcut(line)
                if len(tok) > 1 and tok not in stopword
            ]
        except Exception:
            # Best effort: report the offending line and carry on. Unlike the
            # former bare `except:`, this no longer swallows KeyboardInterrupt
            # or SystemExit.
            print(line)
            continue
        segment.append("__label__" + str(category) + ", " + " ".join(tokens))
FastText模型的使用:
# Train a supervised model from the labelled training file; the second
# argument is the output model name.
# NOTE(review): `fasttext.supervised` / `load_model(..., label_prefix=...)` look
# like the legacy Python binding API -- confirm against the installed version.
classifier = fasttext.supervised(
"news_fasttext_train.txt",
"news_fasttext.model",
label_prefix="__label__")
# Load the model from 'news_fasttext.model.bin' (presumably the file written by
# the training call above -- verify), using the same label prefix.
classifier = fasttext.load_model(
'news_fasttext.model.bin',
label_prefix='__label__')
# Evaluate on the labelled test file and report precision and recall.
result = classifier.test("news_fasttext_test.txt")
print(result.precision)
print(result.recall)
FastText有两种用途,上面是其中一种,有监督的训练,即文本分类;另外一种是无监督的训练,即词向量训练
# Skip-gram model (alternative, left disabled)
# model = fasttext.skipgram('vecdata.txt', 'vecSkip.model')
# print(model.words)
# CBOW model: train word vectors from 'vecdata.txt'; the second argument is the
# output model name.
model = fasttext.cbow('vecdata.txt', 'vecCbow.model')
# Show the model's words, then the entry looked up for the word '职能'.
print(model.words)
print(model['职能'])
五、深度学习实现文本分类
1.词袋模型
def bag_of_words_models(features, target):
    """Bag-of-words classifier: encoded input -> one linear layer -> softmax loss.

    Args:
        features: Input passed to ``encoders.bow_encoder`` (assumed to be
            word-id sequences -- confirm against the caller).
        target: Class labels; one-hot encoded here with depth 5.

    Returns:
        A ``(predictions, loss, train_op)`` tuple, where ``predictions`` is a
        dict with the arg-max class under 'class' and the softmax output
        under 'prob'.
    """
    embedding_size = 50
    n_classes = 5

    onehot_target = tf.one_hot(target, n_classes, 1, 0)
    encoded = encoders.bow_encoder(
        features, vocab_size=n_words, embed_dim=embedding_size)

    # Fully connected layer without activation: its outputs are the logits.
    logits = tf.contrib.layers.fully_connected(
        encoded, n_classes, activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits, onehot_target)

    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01)

    predictions = {
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits),
    }
    return (predictions, loss, train_op)
2. CNN模型
def cnn_model(features, target):
    """Two-convolution-layer text classifier.

    Args:
        features: Input passed to ``embed_sequence`` (assumed to be word-id
            sequences -- confirm against the caller).
        target: Class labels; one-hot encoded here with depth 5.

    Returns:
        A ``(predictions, loss, train_op)`` tuple, where ``predictions`` is a
        dict with the arg-max class under 'class' and the softmax output
        under 'prob'.
    """
    embedding_size = 20
    n_filters = 10
    window_size = 20
    pooling_window = 4
    pooling_stride = 2
    n_classes = 5
    # Kernel of layer 1 covers `window_size` x the embedding width; kernel of
    # layer 2 covers `window_size` x the number of layer-1 filters.
    filter_shape1 = [window_size, embedding_size]
    filter_shape2 = [window_size, n_filters]

    onehot_target = tf.one_hot(target, n_classes, 1, 0)
    embedded = tf.contrib.layers.embed_sequence(
        features, vocab_size=n_words, embed_dim=embedding_size, scope='words')
    # Insert a new axis at index 3 before the 2-D convolution.
    embedded = tf.expand_dims(embedded, 3)

    with tf.variable_scope('CNN_Layer1'):
        # Convolution layer.
        conv1 = tf.contrib.layers.convolution2d(
            embedded, n_filters, filter_shape1, padding='VALID')
        # ReLU non-linearity.
        conv1 = tf.nn.relu(conv1)
        # Max pooling along axis 1.
        pool1 = tf.nn.max_pool(
            conv1,
            ksize=[1, pooling_window, 1, 1],
            strides=[1, pooling_stride, 1, 1],
            padding='SAME')
        # Swap the last two axes.
        pool1 = tf.transpose(pool1, [0, 1, 3, 2])

    with tf.variable_scope('CNN_Layer2'):
        # Convolution layer.
        conv2 = tf.contrib.layers.convolution2d(
            pool1, n_filters, filter_shape2, padding='VALID')
        # Feature extraction: max over axis 1, then drop the size-1 axis.
        pooled_max = tf.reduce_max(conv2, 1)
        pool2 = tf.squeeze(pooled_max, squeeze_dims=[1])

    # Fully connected layer without activation: its outputs are the logits.
    logits = tf.contrib.layers.fully_connected(
        pool2, n_classes, activation_fn=None)
    loss = tf.losses.softmax_cross_entropy(onehot_target, logits)

    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01)

    predictions = {
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits),
    }
    return (predictions, loss, train_op)
# Build the model: wrap an Estimator that uses cnn_model in SKCompat.
classifier = learn.SKCompat(learn.Estimator(model_fn=cnn_model))
# Train, then predict on the test data, keeping the 'class' output.
classifier.fit(train_data,train_target)
y_predicted = classifier.predict(test_data)['class']
print(y_predicted[0])
# Accuracy of the predicted classes against the test labels.
score = metrics.accuracy_score(test_target, y_predicted)
print("score:", score)
3. GRU模型
def run_gru_model(features, target):
    """GRU text classifier: embedding -> GRU -> one linear layer -> softmax loss.

    Args:
        features: Input passed to ``embed_sequence`` (assumed to be word-id
            sequences -- confirm against the caller).
        target: Class labels; one-hot encoded here with depth 5.

    Returns:
        A ``(predictions, loss, train_op)`` tuple, where ``predictions`` is a
        dict with the arg-max class under 'class' and the softmax output
        under 'prob'.
    """
    EMBEDDING_SIZE = 50
    target = tf.one_hot(target, 5, 1, 0)
    word_vectors = tf.contrib.layers.embed_sequence(
        features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')
    # Unstack the embedded input along axis 1 into a list of tensors, which is
    # what static_rnn receives below.
    word_list = tf.unstack(word_vectors, axis=1)
    # GRU cell with EMBEDDING_SIZE units.
    cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)
    # Only the second value returned by static_rnn is kept, as the encoding.
    _, encoding = tf.contrib.rnn.static_rnn(cell, word_list, dtype=tf.float32)
    # Fixed: this previously called `full_connected`, which is not a
    # tf.contrib.layers function; the other models use `fully_connected`.
    logits = tf.contrib.layers.fully_connected(encoding, 5, activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01)
    return ({'class': tf.argmax(logits, 1),
             'prob': tf.nn.softmax(logits)
             }, loss, train_op)
# Build the model: wrap an Estimator that uses run_gru_model in SKCompat.
classifier = learn.SKCompat(learn.Estimator(model_fn=run_gru_model))
# Train, then predict on the test data, keeping the 'class' output.
classifier.fit(train_data, train_target)
y_predicted = classifier.predict(test_data)['class']
print(y_predicted[0])
# Accuracy of the predicted classes against the test labels.
score = metrics.accuracy_score(test_target, y_predicted)
print("score:", score)