import pandas as pd
import numpy as np
import random
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
import matplotlib.pyplot as plt
# Build a synthetic table: 20 columns named page_0 .. page_19, each holding
# 10000 random 0/1 values drawn one at a time with random.randrange.
pages = [f'page_{n}' for n in range(20)]
page_data = pd.DataFrame(
    {
        page_name: [random.randrange(0, 2, 1) for _ in range(10000)]
        for page_name in pages
    },
    index=list(range(10000)),
)

# Ward-linkage agglomerative model configured for two clusters.
ag = AgglomerativeClustering(linkage='ward', n_clusters=2)

# Only the first 200 rows are used for fitting and plotting.
page_data_n = page_data.iloc[:200]
ag.fit(page_data_n)

# Reference dendrogram: let SciPy compute the Ward linkage on the same rows
# and draw it on a fresh 15x10 figure.
plt.figure(figsize=(15, 10))
ward_linkage = shc.linkage(page_data_n, method='ward')
dend = shc.dendrogram(ward_linkage)
# Build a SciPy linkage matrix from a fitted scikit-learn hierarchical model
# (its merge tree in ``children_``) and draw it as a dendrogram.
def _subtree_sizes(children):
    """Return how many original samples sit under each merge in *children*.

    ``children`` is an ``(n_merges, 2)`` array of node ids. Ids smaller than
    ``n_merges + 1`` are treated as single-sample leaves; id
    ``n_merges + 1 + k`` is treated as the cluster created by row ``k``.
    """
    n_merges = children.shape[0]
    n_leaves = n_merges + 1
    sizes = np.zeros(n_merges)
    for row, pair in enumerate(children):
        # A leaf contributes one sample; an earlier merge contributes the
        # size already recorded for it.
        sizes[row] = sum(
            1 if node < n_leaves else sizes[node - n_leaves] for node in pair
        )
    return sizes


def plot_dendrogram(self, y_top, y_bot, **kwargs):
    """Plot a SciPy dendrogram for a fitted hierarchical-clustering model.

    Args:
        self: Fitted estimator exposing ``children_`` (an ``(n_merges, 2)``
            array of merged node ids). Despite the parameter name this is a
            plain function, not a method.
        y_top: Upper limit applied to the y-axis.
        y_bot: Lower limit applied to the y-axis.
        **kwargs: Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.
    """
    children = self.children_
    n_merges = children.shape[0]

    # Use the real merge distances when the estimator recorded them
    # (scikit-learn documents ``distances_`` as present only with
    # ``compute_distances=True`` or a ``distance_threshold``). Otherwise fall
    # back to the merge order 0..n_merges-1, so heights are ranks, not
    # distances.
    heights = getattr(self, "distances_", None)
    if heights is None:
        heights = np.arange(n_merges)

    # Fourth linkage column = number of samples in each newly formed cluster;
    # SciPy shows it as the "(k)" label on truncated leaves.
    linkage_matrix = np.column_stack(
        [children, heights, _subtree_sizes(children)]
    ).astype(float)

    # The dendrogram is drawn on the current axes, i.e. the ones just created.
    _, ax = plt.subplots(figsize=(15, 10))
    shc.dendrogram(linkage_matrix, **kwargs)
    ax.set_ylim(y_bot, y_top)
    plt.show()
# Draw the fitted model's tree, truncated with truncate_mode='lastp' (p=100),
# with the y-axis limited to the 100..210 range.
plot_dendrogram(
    ag,
    y_top=210,
    y_bot=100,
    truncate_mode='lastp',
    p=100,
    color_threshold=180,
)
# Agglomerative Clustering
# (Web-page residue: "latest recommended article published 2024-03-22 00:53:55")