目录
2.2 对于每个类别选择合适的化学成分对其进行亚类划分,给出具体的划分方法及划分结果,并对分类结果的合理性和敏感性进行分析。
import numpy as np
d12_K = d12.iloc[np.where((d12['类型'] == '高钾'))[0],6:20]
d12_Pb = d12.iloc[np.where((d12['类型'] == '铅钡'))[0],6:20]
print(d12_K.shape)
print(d12_Pb.shape)
(18, 14)
(49, 14)
铅钡玻璃
# Min-Max Normalization
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
d12_Pb_norm = pd.DataFrame(min_max_scaler.fit_transform(d12_Pb),
columns=list(d12_Pb.columns))
无监督变量筛选(聚类前筛选化学成分)
方差过滤法
https://scikit-learn.org/stable/modules/feature_selection.html
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold=0.5):
selector = VarianceThreshold(threshold)
selector.fit(data)
return data[data.columns[selector.get_support(indices=True)]]
d12_Pb_norm_var = variance_threshold_selector(d12_Pb_norm, 0.01)
print(d12_Pb_norm_var.shape)
d12_Pb_norm_var.head()
(49, 14)
二氧化硅(SiO2) | 氧化钠(Na2O) | 氧化钾(K2O) | 氧化钙(CaO) | 氧化镁(MgO) | 氧化铝(Al2O3) | 氧化铁(Fe2O3) | 氧化铜(CuO) | 氧化铅(PbO) | 氧化钡(BaO) | 五氧化二磷(P2O5) | 氧化锶(SrO) | 氧化锡(SnO2) | 二氧化硫(SO2) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.453545 | 0.0 | 0.744681 | 0.365625 | 0.432234 | 0.380130 | 0.405229 | 0.024598 | 0.626006 | 0.000000 | 0.252654 | 0.169643 | 0.0 | 0.000000 |
1 | 0.228723 | 0.0 | 0.000000 | 0.231250 | 0.000000 | 0.064075 | 0.000000 | 0.984863 | 0.318174 | 0.880959 | 0.254069 | 0.330357 | 0.0 | 0.161755 |
2 | 0.012397 | 0.0 | 0.000000 | 0.498437 | 0.000000 | 0.047516 | 0.000000 | 0.297067 | 0.380069 | 0.863752 | 0.535032 | 0.473214 | 0.0 | 0.942320 |
3 | 0.416075 | 0.0 | 0.148936 | 0.548437 | 0.260073 | 0.161267 | 0.000000 | 0.466414 | 0.264160 | 0.412130 | 0.663836 | 0.330357 | 0.0 | 0.000000 |
4 | 0.361053 | 0.0 | 0.000000 | 0.457813 | 0.216117 | 0.224622 | 0.289760 | 0.332072 | 0.550320 | 0.150917 | 0.624912 | 0.169643 | 0.0 | 0.000000 |
拉普拉斯分数
Laplacian Score 是一个对一个训练集样本的特征进行打分的算法。通过这个算法可以给每一个特征打出一个分数,最后再取分数最低的k个特征作为最后选择的特征子集,是标准的 Filter 式方法。
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
def cal_lap_score(features, D, L):
features_ = features - np.sum((features @ D) / np.sum(D))
L_score = (features_ @ L @ features_) / (features_ @ D @ features_)
return L_score
def get_k_nearest(dist, k, sample_index):
# dist is zero means it is the sample itself
return sorted(
range(len(dist)),
key=lambda i: dist[i] if i != sample_index else np.inf
)[:k] + [sample_index]
def laplacian_score(df_arr, label=None, **kwargs):
kwargs.setdefault("k_nearest", 5)
'''
Construct distance matrix, dist_matrix, using euclidean distance
'''
distances = pdist(df_arr, metric='euclidean')
dist_matrix = squareform(distances)
del distances
'''
Determine the edge of each sample pairs by k nearest neighbor
'''
edge_sparse_matrix = pd.DataFrame(
np.zeros((df_arr.shape[0], df_arr.shape[0])),
dtype=int
)
if label is None:
for row_id, row in enumerate(dist_matrix):
k_nearest_id = get_k_nearest(row, kwargs["k_nearest"], row_id)
edge_sparse_matrix.iloc[k_nearest_id, k_nearest_id] = 1
else:
label = np.array(label)
unique_label = np.unique(label)
for i in unique_label:
group_index = np.where(label == i)[0]
edge_sparse_matrix.iloc[group_index, group_index] = 1
S = dist_matrix * edge_sparse_matrix
del dist_matrix, edge_sparse_matrix
'''
Calculate the Laplacian graph L
'''
D = np.diag(S.sum(axis=1))
L = D - S
del S
'''
Minimize the Laplacian score
'''
features_lap_score = np.apply_along_axis(
func1d=lambda f: cal_lap_score(f, D, L), axis=0, arr=df_arr
)
return features_lap_score
data = {'化学成分': list(d12_Pb.columns),
'拉普拉斯分数': list(laplacian_score(np.array(d12_Pb_norm), k_nearest=3))}
d12_Pb_norm_lap = pd.DataFrame(data)
d12_Pb_norm_lap = d12_Pb_norm_lap.sort_values('拉普拉斯分数')
d12_Pb_norm_lap
化学成分 | 拉普拉斯分数 | |
---|---|---|
0 | 二氧化硅(SiO2) | 0.287870 |
8 | 氧化铅(PbO) | 0.304957 |
9 | 氧化钡(BaO) | 0.395260 |
1 | 氧化钠(Na2O) | 0.397298 |
10 | 五氧化二磷(P2O5) | 0.428794 |
4 | 氧化镁(MgO) | 0.461562 |
5 | 氧化铝(Al2O3) | 0.473288 |
3 | 氧化钙(CaO) | 0.485366 |
6 | 氧化铁(Fe2O3) | 0.486807 |
7 | 氧化铜(CuO) | 0.569931 |
11 | 氧化锶(SrO) | 0.613548 |
13 | 二氧化硫(SO2) | 0.789059 |
2 | 氧化钾(K2O) | 0.828154 |
12 | 氧化锡(SnO2) | 0.937966 |
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
# Linux show Chinese characters *** important
plt.rcParams['font.family'] = 'WenQuanYi Micro Hei'
fig = px.bar(d12_Pb_norm_lap, y="化学成分", x="拉普拉斯分数",
title="各化学成分的拉普拉斯分数", orientation='h')
# center title
fig.update_layout(title_x=0.5)
# remove background color
fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})
fig.show()
聚类
https://machinelearningmastery.com/clustering-algorithms-with-python/
K-means聚类
from sklearn.cluster import KMeans
fig = plt.figure(figsize=(10,8))
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
inertias = []
for i in range(1,11):
kmeans = KMeans(n_clusters=i)
kmeans.fit(d12_Pb)
inertias.append(kmeans.inertia_)
plt.plot(range(1,11), inertias, marker='o', linewidth=3)
plt.title('Elbow (铅钡玻璃)')
plt.xlabel('聚类数量')
plt.ylabel('Inertia')
plt.rc('font', size=20)
plt.rc('figure', titlesize=20)
plt.show()
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(d12_Pb)
km_labels = kmeans.labels_
km_labels
array([2, 0, 0, 0, 2, 0, 1, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1,
2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2,
2, 1, 2, 2, 2], dtype=int32)
pd.DataFrame(kmeans.cluster_centers_)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21.592857 | 0.000000 | 0.188571 | 1.871429 | 0.101429 | 2.008571 | 0.215714 | 6.555714 | 26.344286 | 27.705714 | 5.084286 | 0.464286 | 6.938894e-18 | 5.074286 |
1 | 58.496500 | 1.975000 | 0.161500 | 1.199500 | 0.739000 | 5.044500 | 0.580000 | 0.984500 | 20.413000 | 7.196500 | 0.896500 | 0.234000 | 7.700000e-02 | 0.183000 |
2 | 26.537273 | 0.219091 | 0.179545 | 2.880000 | 0.733636 | 2.943636 | 0.865000 | 1.205455 | 47.338182 | 8.007273 | 4.900909 | 0.414545 | 5.954545e-02 | 0.000000 |
聚类模型评估指标
https://www.geeksforgeeks.org/clustering-metrics/
def clustering_evaluation(data, label):
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import mutual_info_score, adjusted_rand_score
# Calculate clustering metrics
silhouette = silhouette_score(data, label)
db_index = davies_bouldin_score(data, label)
ch_index = calinski_harabasz_score(data, label)
# Print the metric scores
print(f"Silhouette Score: {silhouette:.2f}")
print(f"Davies-Bouldin Index: {db_index:.2f}")
print(f"Calinski-Harabasz Index: {ch_index:.2f}")
clustering_evaluation(d12_Pb, km_labels)
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.26
使用UMAP方法对聚类结果进行可视化
def clustering_plot(data, label, title):
import umap.umap_ as umap
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(data)
plt.scatter(
embedding[:, 0],
embedding[:, 1],
c=label)
plt.title(title, fontsize=16);
clustering_plot(d12_Pb, km_labels, 'K-Means聚类结果在UMAP上的可视化')
层次聚类
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import matplotlib.pyplot as plt
linkage_data = linkage(d12_Pb, method='ward', metric='euclidean')
dendrogram(linkage_data)
plt.show()
from sklearn.cluster import AgglomerativeClustering
hierarchical_cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
hc_labels = hierarchical_cluster.fit_predict(d12_Pb)
hc_labels
array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
0, 1, 0, 0, 0])
clustering_evaluation(d12_Pb, hc_labels)
clustering_plot(d12_Pb, hc_labels, '层次聚类结果在UMAP上的可视化')
Silhouette Score: 0.50
Davies-Bouldin Index: 0.70
Calinski-Harabasz Index: 66.81
高斯混合模型 GMM
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=3)
# fit the model
model.fit(d12_Pb)
gmm_labels = model.predict(d12_Pb)
print('clustering labels: ')
gmm_labels
clustering labels:
array([1, 2, 2, 1, 1, 0, 0, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
1, 0, 1, 1, 1])
clustering_evaluation(d12_Pb, gmm_labels)
clustering_plot(d12_Pb, gmm_labels, 'GMM聚类结果在UMAP上的可视化')
Silhouette Score: 0.51
Davies-Bouldin Index: 0.75
Calinski-Harabasz Index: 61.87
Leiden莱顿聚类
from leiden_clustering import LeidenClustering
import numpy as np
clustering = LeidenClustering()
clustering.fit(d12_Pb)
leiden_labels = clustering.labels_
leiden_labels
array([0, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
0, 1, 0, 0, 0])
clustering_evaluation(d12_Pb, leiden_labels)
clustering_plot(d12_Pb, leiden_labels, 'Leiden聚类结果在UMAP上的可视化')
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.26
Affinity Propagation 亲和力传播
from sklearn.cluster import AffinityPropagation
# define the model
ap = AffinityPropagation()
# fit the model
ap.fit(d12_Pb)
# assign a cluster to each example
ap_labels = ap.predict(d12_Pb)
ap_labels
array([1, 2, 0, 2, 1, 2, 4, 2, 4, 2, 0, 3, 3, 1, 1, 3, 3, 3, 1, 3, 1, 3,
1, 5, 5, 1, 4, 4, 5, 1, 3, 3, 4, 4, 4, 1, 4, 5, 4, 1, 5, 1, 3, 5,
5, 4, 1, 5, 1])
clustering_evaluation(d12_Pb, ap_labels)
clustering_plot(d12_Pb, ap_labels, 'Affinity Propagation聚类结果在UMAP上的可视化')
Silhouette Score: 0.33
Davies-Bouldin Index: 0.95
Calinski-Harabasz Index: 52.03
Agglomerative Clustering 聚集聚类
from sklearn.cluster import AgglomerativeClustering
# define the model
ac = AgglomerativeClustering(n_clusters=3)
# fit model and predict clusters
ac_labels = ac.fit_predict(d12_Pb)
ac_labels
array([0, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
0, 1, 0, 0, 0])
clustering_evaluation(d12_Pb, ac_labels)
clustering_plot(d12_Pb, ac_labels, 'Agglomerative Clustering聚类结果在UMAP上的可视化')
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.44
BIRCH聚类
from sklearn.cluster import Birch
bir = Birch(threshold=0.01, n_clusters=3)
# fit the model
bir.fit(d12_Pb)
# assign a cluster to each example
bir_labels = bir.predict(d12_Pb)
print(bir_labels)
clustering_evaluation(d12_Pb, bir_labels)
clustering_plot(d12_Pb, bir_labels, 'BIRCH聚类结果在UMAP上的可视化')
[0 2 2 2 0 2 1 2 1 2 2 1 1 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 1 1 0 1
0 1 0 0 0 1 0 0 1 0 0 0]
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.44
OPTICS 聚类
from sklearn.cluster import OPTICS
op = OPTICS(eps=0.8, min_samples=10)
# fit model and predict clusters
op_labels = op.fit_predict(d12_Pb)
print(op_labels)
clustering_evaluation(d12_Pb, op_labels)
clustering_plot(d12_Pb, op_labels, 'OPTICS聚类结果在UMAP上的可视化')
[-1 -1 -1 -1 0 -1 1 -1 -1 -1 -1 1 1 0 0 1 1 1 0 1 0 1 0 -1
-1 0 1 1 -1 0 1 1 1 1 1 0 1 0 -1 0 0 0 1 0 -1 -1 0 0
0]
Silhouette Score: 0.33
Davies-Bouldin Index: 2.80
Calinski-Harabasz Index: 31.64
谱聚类 Spectral Clustering
from sklearn.cluster import SpectralClustering
sc = SpectralClustering(n_clusters=3)
# fit model and predict clusters
sc_labels = model.fit_predict(d12_Pb)
print(sc_labels)
clustering_evaluation(d12_Pb, sc_labels)
clustering_plot(d12_Pb, sc_labels, '谱聚类结果在UMAP上的可视化')
[0 2 2 0 0 1 1 2 1 2 2 1 1 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0
0 0 0 0 0 1 0 0 1 0 0 0]
Silhouette Score: 0.42
Davies-Bouldin Index: 0.81
Calinski-Harabasz Index: 43.88
有监督变量筛选(聚类后筛选化学成分)
X = d12_Pb_norm
y = gmm_labels
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.25, random_state=1)
随机森林度量特征重要性
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
importances = clf.feature_importances_
feature_imp_df = pd.DataFrame({'变量名': X.columns, '重要性程度': importances}).sort_values('重要性程度', ascending=True)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 0.9230769230769231
precision recall f1-score support
0 1.00 0.50 0.67 2
1 0.89 1.00 0.94 8
2 1.00 1.00 1.00 3
accuracy 0.92 13
macro avg 0.96 0.83 0.87 13
weighted avg 0.93 0.92 0.91 13
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
# Linux show Chinese characters *** important
plt.rcParams['font.family'] = 'WenQuanYi Micro Hei'
fig = px.bar(feature_imp_df, y="变量名", x="重要性程度",
title="各化学成分在铅钡玻璃中的重要性程度",
orientation='h', width=600, height=500)
# center title
fig.update_layout(title_x=0.5)
# remove background color
fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})
fig.show()
单变量特征选择 – F 检验
https://scikit-learn.org/stable/modules/feature_selection.html
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# Create and fit selector
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
# Get columns to keep and create new dataframe with those only
cols_idxs = selector.get_support(indices=True)
d12_Pb_norm_F = X.iloc[:,cols_idxs]
d12_Pb_norm_F.head()
二氧化硅(SiO2) | 氧化钠(Na2O) | 氧化钙(CaO) | 氧化铝(Al2O3) | 氧化铁(Fe2O3) | 氧化铜(CuO) | 氧化铅(PbO) | 氧化钡(BaO) | 五氧化二磷(P2O5) | 二氧化硫(SO2) | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.453545 | 0.0 | 0.365625 | 0.380130 | 0.405229 | 0.024598 | 0.626006 | 0.000000 | 0.252654 | 0.000000 |
1 | 0.228723 | 0.0 | 0.231250 | 0.064075 | 0.000000 | 0.984863 | 0.318174 | 0.880959 | 0.254069 | 0.161755 |
2 | 0.012397 | 0.0 | 0.498437 | 0.047516 | 0.000000 | 0.297067 | 0.380069 | 0.863752 | 0.535032 | 0.942320 |
3 | 0.416075 | 0.0 | 0.548437 | 0.161267 | 0.000000 | 0.466414 | 0.264160 | 0.412130 | 0.663836 | 0.000000 |
4 | 0.361053 | 0.0 | 0.457813 | 0.224622 | 0.289760 | 0.332072 | 0.550320 | 0.150917 | 0.624912 | 0.000000 |
递归特征消除RFE
https://www.geeksforgeeks.org/recursive-feature-elimination-with-cross-validation-in-scikit-learn/
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
selector = RFE(estimator=LogisticRegression(), step=1)
selector = selector.fit(X, y)
# Print the optimal number of features
print("Optimal number of features: %d" % selector.n_features_)
# Print the selected features
print("Selected features: %s" % selector.support_)
# Print the rank of features
print("Selected features: %s" % selector.ranking_)
Optimal number of features: 5
Selected features: [ True False True False False True True True False False]
Selected features: [1 2 1 4 3 1 1 1 6 5]
d12_Pb_norm_rfe = X.iloc[:, selector.support_]
d12_Pb_norm_rfe.head()
二氧化硅(SiO2) | 氧化钙(CaO) | 氧化铜(CuO) | 氧化铅(PbO) | 氧化钡(BaO) | |
---|---|---|---|---|---|
0 | 0.453545 | 0.365625 | 0.024598 | 0.626006 | 0.000000 |
1 | 0.228723 | 0.231250 | 0.984863 | 0.318174 | 0.880959 |
2 | 0.012397 | 0.498437 | 0.297067 | 0.380069 | 0.863752 |
3 | 0.416075 | 0.548437 | 0.466414 | 0.264160 | 0.412130 |
4 | 0.361053 | 0.457813 | 0.332072 | 0.550320 | 0.150917 |
相关阅读: