1、初始化优化
先用 k-means 聚类得到各簇的中心 μ,再据此初始化 EM 算法的参数:均值取簇中心,权重取各簇样本数占总样本数的比例,方差取各簇内样本在每个维度上相对簇中心的方差。
2、初始化优化代码实现
k-means初始化mean
from sklearn.cluster import KMeans
np.random.seed(5)
num_clusters = 25

# Cluster the rows of tf_idf with scikit-learn's k-means; the resulting
# centroids and hard assignments seed the EM initialization.
# n_jobs is deliberately not passed: it was deprecated in scikit-learn 0.23
# and removed in 1.0, and the previous default was already single-process,
# so omitting it keeps the old behavior while working on current releases.
kmeans_model = KMeans(
    n_clusters=num_clusters,
    n_init=5,
    max_iter=400,
    random_state=1,
)
kmeans_model.fit(tf_idf)

centroids = kmeans_model.cluster_centers_
cluster_assignment = kmeans_model.labels_

# One mean vector per cluster (each row of the centroid matrix).
means = list(centroids)
根据mean初始化weights:
num_docs = tf_idf.shape[0]

# Initial mixing weight of cluster i = fraction of all documents that
# k-means assigned to cluster i.
weights = []
for i in range(num_clusters):
    # np.sum counts the True entries in one vectorized pass; the builtin
    # sum() would iterate the array element by element in Python.
    num_assigned = np.sum(cluster_assignment == i)
    w = float(num_assigned) / num_docs
    weights.append(w)
根据mean初始化cov
# Initial diagonal covariance of each cluster, computed from the rows that
# k-means assigned to it.
# NOTE(review): assumes every cluster has at least one member row; an empty
# cluster would divide by zero below -- confirm against the k-means output.
covs = []
for i in range(num_clusters):
    member_rows = tf_idf[cluster_assignment == i]
    # Per-dimension variance around the cluster mean mu, expanded as
    #   mean(x^2 - 2*x*mu) + mu^2
    # NOTE(review): presumably written this way so tf_idf can stay a scipy
    # sparse matrix (uses .multiply / .A1) -- confirm.
    cov = (
        member_rows.multiply(member_rows)
        - 2 * member_rows.dot(diag(means[i]))
    ).sum(axis=0).A1 / member_rows.shape[0] + means[i] ** 2
    # Floor the variances so no dimension ends up below 1e-8.
    cov[cov < 1e-8] = 1e-8
    covs.append(cov)
运行EM:
# Run EM starting from the k-means-derived means, covariances and weights.
# NOTE(review): cov_smoothing is presumably a small regularizer added to the
# variances inside EM_for_high_dimension -- confirm against its definition.
out = EM_for_high_dimension(tf_idf, means, covs, weights, cov_smoothing=1e-10)
3、随机初始化代码实现
np.random.seed(5)  # Fixed seed so the random initialization is reproducible.
num_clusters = len(means)
num_docs, num_words = tf_idf.shape

# Random initial parameters, one entry per cluster.
random_means = []
random_covs = []
random_weights = []
for k in range(num_clusters):
    # Mean: num_words draws from the standard normal distribution
    # (mean 0, variance 1). randn is used because np.random.random would
    # draw uniformly from [0, 1) instead.
    mean = np.random.randn(num_words)

    # Diagonal covariance: num_words values drawn uniformly between 1 and 5.
    cov = np.random.uniform(1, 5, num_words)

    # Every cluster starts with the same mixing weight.
    weight = 1.0 / num_clusters

    random_means.append(mean)
    random_covs.append(cov)
    random_weights.append(weight)
运行EM:
# Run EM starting from the randomly drawn means, covariances and weights.
# NOTE(review): cov_smoothing is presumably a small regularizer added to the
# variances inside EM_for_high_dimension -- confirm against its definition.
out_random_init = EM_for_high_dimension(tf_idf, random_means, random_covs, random_weights, cov_smoothing=1e-5)