MiniBatchKMeans is much faster than KMeans while producing comparable clusters.
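To get a feel for the speed gap, here is a minimal benchmark sketch on synthetic data (the sample count, feature dimension, and cluster count below are illustrative assumptions, not taken from the original experiment):

from time import time

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets import make_blobs

# Illustrative stand-in corpus: 100k points, 50 dims, 100 true centers.
X, _ = make_blobs(n_samples=100000, n_features=50, centers=100, random_state=42)

for cls in (KMeans, MiniBatchKMeans):
    t0 = time()
    cls(n_clusters=100, n_init=3, random_state=42).fit(X)
    print('%s: %.2fs' % (cls.__name__, time() - t0))

On data of this size MiniBatchKMeans typically finishes far sooner, since it updates centroids from small random batches instead of full passes. Applied to text clustering, the full script: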
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
from collections import defaultdict
from time import time

import jieba
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer

logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
def load_stopwords():
    # path = '/Users/fhqplzj/github/HanLP/data/dictionary/stopwords.txt'
    path = '/data/zhaojun/local_projects/stopwords.txt'
    with open(path, encoding='utf-8') as fin:
        return frozenset(fin.read().splitlines())

# Stopword set, loaded once at module level.
stopwords = load_stopwords()
# Matches tokens consisting entirely of Chinese characters.
chinese = re.compile('^[\u4e00-\u9fa5]+$')

def chinese_non_stopwords(word):
    # Keep words that are all Chinese characters and not stopwords.
    return bool(chinese.match(word)) and word not in stopwords
def sentence_tokenizer(sentence):
    # Each line is "id<TAB>text"; fall back to a filler word when the
    # text field is missing so the vectorizer always receives tokens.
    try:
        content = sentence.strip().split('\t', 1)[1]
    except IndexError:
        content = '呢'
    # Segment with jieba, keeping only Chinese non-stopword tokens.
    return [word for word in jieba.lcut(content) if chinese_non_stopwords(word)]
def load_documents(path):
    # path = '/Users/fhqplzj/Downloads/part-' + path
    path = '/data/zhaojun/part100/part-' + path
    logger.info('processing file: %s' % path)
    with open(path, encoding='utf-8') as fin:
        return fin.read().splitlines()
if __name__ == '__main__':
    # Read all 100 input shards into one document list.
    file_names = ['{:05d}'.format(i) for i in range(100)]
    docs = []
    for file_name in file_names:
        docs.extend(load_documents(file_name))

    t0 = time()
    logger.info('TfidfVectorizer...')
    # Drop terms seen in fewer than 5 documents or in more than 10% of them.
    vectorizer = TfidfVectorizer(tokenizer=sentence_tokenizer, min_df=5, max_df=0.1)
    X = vectorizer.fit_transform(docs)
    logger.info('vectorizer: %fs' % (time() - t0))

    t0 = time()
    logger.info('MiniBatchKMeans...')
    km = MiniBatchKMeans(n_clusters=100, batch_size=1000)
    km.fit(X)
    logger.info('kmeans: %fs' % (time() - t0))

    t0 = time()
    logger.info('collecting result')
    # Group documents by predicted cluster label.
    result = defaultdict(list)
    for label, doc in zip(km.labels_, docs):
        result[label].append(doc)
    # Write each cluster to its own file under /tmp/cluster.
    for k in result:
        out_path = os.path.join('/tmp/cluster', 'res-{:05d}'.format(k))
        logger.info('writing %s' % out_path)
        with open(out_path, 'w', encoding='utf-8') as fout:
            for elem in result[k]:
                fout.write(elem + '\n')
    logger.info('finished: %fs' % (time() - t0))
    # Optionally, print the ten highest-weight terms per cluster centroid:
    # sorted_indices = km.cluster_centers_.argsort()[:, ::-1]
    # id2words = vectorizer.get_feature_names()
    # for i in range(km.n_clusters):
    #     print('cluster: %i' % i)
    #     for idx in sorted_indices[i, :10]:
    #         print(' %s' % id2words[idx])
    #     print()
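A runnable version of that commented-out inspection block for a recent scikit-learn (which replaced get_feature_names with get_feature_names_out; on older releases substitute the old name) might look like this sketch, assuming km and vectorizer from the script above are in scope:

# Ten highest-weight terms per cluster centroid.
sorted_indices = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(km.n_clusters):
    top_terms = ' '.join(terms[idx] for idx in sorted_indices[i, :10])
    print('cluster %d: %s' % (i, top_terms))

Eyeballing the top terms per cluster is a quick sanity check on whether the chosen n_clusters, min_df, and max_df produce coherent topics.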