自然语言处理自学笔记-03 使用Word2vec进行文档分类
使用Word2vec进行文档分类
Word2vec提供了一种单词数字表示的方法。词嵌入被用作许多任务的单词特征表示,比如图像标题生成和机器翻译。文档分类任务是词嵌入实际应用中的很简单的一个应用。在文档分类中,需要通过词向量技术嵌入整个文档而不是单词。
用词向量进行文档分类
在小数据集上,CBOW算法要好于skip-gram。在这里我们使用CBOW算法来进行文档分类。过程如下:
- 从所有文本文件中提取数据并学习词嵌入。
- 从已经训练过的文档中提取一组随机文档。
- 用文档中找到的词嵌入向量的平均值来表示文档。
- 使用t-SNE可视化技术可视化找到的文档嵌入。
- 使用K-means聚类算法进行聚类分析。
代码实现
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import nltk # standard preprocessing
import operator # sorting items in dictionary by value
#nltk.download() #tokenizers/punkt/PY3/english.pickle
from math import ceil
# Path to the BBC full-text news dataset archive; read_data/read_test_data
# below read articles for five topics directly out of this zip.
filename = 'bbc-fulltext.zip'
def read_data(filename):
    """Read the BBC dataset from a zip archive into one flat token list.

    Reads up to 249 articles per topic (files 001.txt .. 249.txt) directly
    from the zip without extracting to disk, lowercases each article,
    tokenizes it with nltk, and concatenates all tokens.

    Args:
        filename: Path to the bbc-fulltext.zip archive.

    Returns:
        A single list of word tokens from all articles across all topics.
    """
    data = []
    files_to_read_for_topic = 250
    topics = ['business', 'entertainment', 'politics', 'sport', 'tech']
    with zipfile.ZipFile(filename) as z:
        # First zip entry is the top-level directory containing the topics.
        parent_dir = z.namelist()[0]
        for t in topics:
            # Article files are named 001.txt, 002.txt, ... within each topic.
            for fi in range(1, files_to_read_for_topic):
                with z.open(parent_dir + t + '/' + format(fi, '03d') + '.txt') as f:
                    file_string = f.read().decode('latin-1')
                    file_string = file_string.lower()
                    file_string = nltk.word_tokenize(file_string)
                    data.extend(file_string)
            # BUGFIX: this progress message was originally printed before
            # any file of the topic had been read; moved after the loop so
            # "Finished" is accurate.
            print('\tFinished reading data for topic: ', t)
    return data
def read_test_data(filename):
    """Sample a random held-out set of articles from the zip archive.

    For each topic, draws 10 article indices uniformly at random in
    [1, 250) (duplicates possible, in which case fewer than 10 distinct
    documents are kept) and tokenizes each article exactly as read_data
    does. Nondeterministic unless numpy's RNG is seeded beforehand.

    Args:
        filename: Path to the bbc-fulltext.zip archive.

    Returns:
        Dict mapping '<topic>-<file index>' to that article's token list.
    """
    test_data = {}
    files_to_read_for_topic = 250
    topics = ['business', 'entertainment', 'politics', 'sport', 'tech']
    with zipfile.ZipFile(filename) as z:
        # First zip entry is the top-level directory containing the topics.
        parent_dir = z.namelist()[0]
        for t in topics:
            for fi in np.random.randint(1, files_to_read_for_topic, (10)).tolist():
                with z.open(parent_dir + t + '/' + format(fi, '03d') + '.txt') as f:
                    file_string = f.read().decode('latin-1')
                    file_string = file_string.lower()
                    file_string = nltk.word_tokenize(file_string)
                    test_data[t + '-' + str(fi)] = file_string
            # BUGFIX: this progress message was originally printed before
            # any file of the topic had been read; moved after the loop so
            # "Finished" is accurate.
            print('\tFinished reading data for topic: ', t)
    return test_data
# Load and tokenize the full training corpus (flat token list) and a
# random per-topic test sample (dict of document -> token list).
print('Processing training data...')
words = read_data(filename)
print('\nProcessing testing data...')
test_words = read_test_data(filename)
# Sanity check: peek at the first and last few tokens of the corpus.
print('Example words (start): ', words[:10])
print('Example words (end): ', words[-10:])
# Keep only the most frequent 25,000 words; everything else is mapped to
# the special UNK token by build_dataset below.
vocabulary_size = 25000
def build_dataset(words):
# Allocate a special token for rare words
count = [['UNK', -1]]
# Gets only the vocabulary_size most common words as the vocabulary
# All the other words will be replaced with UNK token
count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
# Create an ID for each word by giving the current length of the dictionary
# And adding that item to the dictionary
dictionary = dict()
for word, _ in count:
dictionary[word] = len(dictionary)
data = list()
unk_count = 0
# Traverse through all the text we have and produce a list
# where each element corresponds to the ID of the word found at that index
for word in words:
# If word is in the dictionary use the word ID,
# else use the ID of the special token "UNK"
if word in dictionary:
index