2022高教社杯全国大学生数学建模竞赛C题问题二(2) Python代码

data learning

已于 2024-09-17 21:16:07 修改

阅读量1.4k

点赞数 24

分类专栏：数学建模文章标签：数学建模 python 聚类

于 2024-09-17 20:29:15 首次发布

本文链接：https://blog.csdn.net/weixin_45481473/article/details/142318056

版权

数学建模专栏收录该内容

8 篇文章 1 订阅

订阅专栏

2.2 对于每个类别选择合适的化学成分对其进行亚类划分，给出具体的划分方法及划分结果，并对分类结果的合理性和敏感性进行分析。

import numpy as np

d12_K = d12.iloc[np.where((d12['类型'] == '高钾'))[0],6:20]
d12_Pb = d12.iloc[np.where((d12['类型'] == '铅钡'))[0],6:20]
print(d12_K.shape)
print(d12_Pb.shape)

(18, 14)
(49, 14)

铅钡玻璃

# Min-Max Normalization
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
d12_Pb_norm = pd.DataFrame(min_max_scaler.fit_transform(d12_Pb), 
             columns=list(d12_Pb.columns))

无监督变量筛选（聚类前筛选化学成分）

https://datascience.stackexchange.com/questions/29572/is-it-possible-to-do-feature-selection-for-unsupervised-machine-learning-problem

在这里插入图片描述

方差过滤法

https://scikit-learn.org/stable/modules/feature_selection.html

from sklearn.feature_selection import VarianceThreshold

def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

d12_Pb_norm_var = variance_threshold_selector(d12_Pb_norm, 0.01)
print(d12_Pb_norm_var.shape)
d12_Pb_norm_var.head()

(49, 14)

	二氧化硅(SiO2)	氧化钾(K2O)	氧化钙(CaO)	氧化镁(MgO)	氧化铝(Al2O3)	氧化铁(Fe2O3)	氧化铜(CuO)	氧化铅(PbO)	氧化钡(BaO)	五氧化二磷(P2O5)	氧化锶(SrO)	二氧化硫(SO2)
0	0.453545	0.744681	0.365625	0.432234	0.380130	0.405229	0.024598	0.626006	0.000000	0.252654	0.169643	0.000000
1	0.228723	0.000000	0.231250	0.000000	0.064075	0.000000	0.984863	0.318174	0.880959	0.254069	0.330357	0.161755
2	0.012397	0.000000	0.498437	0.000000	0.047516	0.000000	0.297067	0.380069	0.863752	0.535032	0.473214	0.942320
3	0.416075	0.148936	0.548437	0.260073	0.161267	0.000000	0.466414	0.264160	0.412130	0.663836	0.330357	0.000000
4	0.361053	0.000000	0.457813	0.216117	0.224622	0.289760	0.332072	0.550320	0.150917	0.624912	0.169643	0.000000

拉普拉斯分数

Laplacian Score 是一个对一个训练集样本的特征进行打分的算法。通过这个算法可以给每一个特征打出一个分数，最后再取分数最低的k个特征作为最后选择的特征子集，是标准的 Filter 式方法。

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform


def cal_lap_score(features, D, L):
    features_ = features - np.sum((features @ D) / np.sum(D))
    L_score = (features_ @ L @ features_) / (features_ @ D @ features_)
    return L_score


def get_k_nearest(dist, k, sample_index):
    # dist is zero means it is the sample itself
    return sorted(
        range(len(dist)),
        key=lambda i: dist[i] if i != sample_index else np.inf
    )[:k] + [sample_index]


def laplacian_score(df_arr, label=None, **kwargs):
    kwargs.setdefault("k_nearest", 5)
    
    '''
    Construct distance matrix, dist_matrix, using euclidean distance
    '''
    distances = pdist(df_arr, metric='euclidean')
    dist_matrix = squareform(distances)
    del distances
    '''
    Determine the edge of each sample pairs by k nearest neighbor
    '''
    edge_sparse_matrix = pd.DataFrame(
        np.zeros((df_arr.shape[0], df_arr.shape[0])),
        dtype=int
    )
    if label is None:
        for row_id, row in enumerate(dist_matrix):
            k_nearest_id = get_k_nearest(row, kwargs["k_nearest"], row_id)
            edge_sparse_matrix.iloc[k_nearest_id, k_nearest_id] = 1
    else:
        label = np.array(label)
        unique_label = np.unique(label)
        for i in unique_label:
            group_index = np.where(label == i)[0]
            edge_sparse_matrix.iloc[group_index, group_index] = 1
    S = dist_matrix * edge_sparse_matrix
    del dist_matrix, edge_sparse_matrix
    '''
    Calculate the Laplacian graph L
    '''
    D = np.diag(S.sum(axis=1))
    L = D - S
    del S
    '''
    Minimize the Laplacian score
    '''
    features_lap_score = np.apply_along_axis(
        func1d=lambda f: cal_lap_score(f, D, L), axis=0, arr=df_arr
    )
    return features_lap_score

data = {'化学成分': list(d12_Pb.columns),
        '拉普拉斯分数': list(laplacian_score(np.array(d12_Pb_norm), k_nearest=3))}
d12_Pb_norm_lap = pd.DataFrame(data)
d12_Pb_norm_lap = d12_Pb_norm_lap.sort_values('拉普拉斯分数')
d12_Pb_norm_lap

	化学成分	拉普拉斯分数
0	二氧化硅(SiO2)	0.287870
8	氧化铅(PbO)	0.304957
9	氧化钡(BaO)	0.395260
1	氧化钠(Na2O)	0.397298
10	五氧化二磷(P2O5)	0.428794
4	氧化镁(MgO)	0.461562
5	氧化铝(Al2O3)	0.473288
3	氧化钙(CaO)	0.485366
6	氧化铁(Fe2O3)	0.486807
7	氧化铜(CuO)	0.569931
11	氧化锶(SrO)	0.613548
13	二氧化硫(SO2)	0.789059
2	氧化钾(K2O)	0.828154
12	氧化锡(SnO2)	0.937966

import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

# Linux show Chinese characters *** important
plt.rcParams['font.family'] = 'WenQuanYi Micro Hei' 

fig = px.bar(d12_Pb_norm_lap, y="化学成分", x="拉普拉斯分数",
             title="各化学成分的拉普拉斯分数", orientation='h')
# center title
fig.update_layout(title_x=0.5)
# remove background color
fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()

在这里插入图片描述

聚类

https://machinelearningmastery.com/clustering-algorithms-with-python/

K-means聚类

from sklearn.cluster import KMeans
fig = plt.figure(figsize=(10,8))
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

inertias = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(d12_Pb)
    inertias.append(kmeans.inertia_)

plt.plot(range(1,11), inertias, marker='o', linewidth=3)
plt.title('Elbow (铅钡玻璃)')
plt.xlabel('聚类数量')
plt.ylabel('Inertia')
plt.rc('font', size=20)  
plt.rc('figure', titlesize=20)
plt.show()

在这里插入图片描述

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(d12_Pb)
km_labels = kmeans.labels_
km_labels

array([2, 0, 0, 0, 2, 0, 1, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1,
       2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2,
       2, 1, 2, 2, 2], dtype=int32)

pd.DataFrame(kmeans.cluster_centers_)

	0	1	2	3	4	5	6	7	8	9	10	11	12	13
0	21.592857	0.000000	0.188571	1.871429	0.101429	2.008571	0.215714	6.555714	26.344286	27.705714	5.084286	0.464286	6.938894e-18	5.074286
1	58.496500	1.975000	0.161500	1.199500	0.739000	5.044500	0.580000	0.984500	20.413000	7.196500	0.896500	0.234000	7.700000e-02	0.183000
2	26.537273	0.219091	0.179545	2.880000	0.733636	2.943636	0.865000	1.205455	47.338182	8.007273	4.900909	0.414545	5.954545e-02	0.000000

聚类模型评估指标

https://www.geeksforgeeks.org/clustering-metrics/

def clustering_evaluation(data, label):
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
    from sklearn.metrics import mutual_info_score, adjusted_rand_score

    # Calculate clustering metrics
    silhouette = silhouette_score(data, label)
    db_index = davies_bouldin_score(data, label)
    ch_index = calinski_harabasz_score(data, label)

    # Print the metric scores
    print(f"Silhouette Score: {silhouette:.2f}")
    print(f"Davies-Bouldin Index: {db_index:.2f}")
    print(f"Calinski-Harabasz Index: {ch_index:.2f}")

clustering_evaluation(d12_Pb, km_labels)

Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.26

使用UMAP方法对聚类结果进行可视化

def clustering_plot(data, label, title):
    import umap.umap_ as umap
    reducer = umap.UMAP(random_state=42)
    embedding = reducer.fit_transform(data)
    
    plt.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=label)
    plt.title(title, fontsize=16);

clustering_plot(d12_Pb, km_labels, 'K-Means聚类结果在UMAP上的可视化')

在这里插入图片描述

层次聚类

from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import matplotlib.pyplot as plt

linkage_data = linkage(d12_Pb, method='ward', metric='euclidean')
dendrogram(linkage_data)
plt.show()

在这里插入图片描述

from sklearn.cluster import AgglomerativeClustering
hierarchical_cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
hc_labels = hierarchical_cluster.fit_predict(d12_Pb)
hc_labels

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0])

clustering_evaluation(d12_Pb, hc_labels)
clustering_plot(d12_Pb, hc_labels, '层次聚类结果在UMAP上的可视化')

Silhouette Score: 0.50
Davies-Bouldin Index: 0.70
Calinski-Harabasz Index: 66.81

在这里插入图片描述

高斯混合模型 GMM

from sklearn.mixture import GaussianMixture

model = GaussianMixture(n_components=3)
# fit the model
model.fit(d12_Pb)
gmm_labels = model.predict(d12_Pb)
print('clustering labels: ')
gmm_labels

clustering labels: 

array([1, 2, 2, 1, 1, 0, 0, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1])

clustering_evaluation(d12_Pb, gmm_labels)
clustering_plot(d12_Pb, gmm_labels, 'GMM聚类结果在UMAP上的可视化')

Silhouette Score: 0.51
Davies-Bouldin Index: 0.75
Calinski-Harabasz Index: 61.87

在这里插入图片描述

Leiden莱顿聚类

from leiden_clustering import LeidenClustering
import numpy as np
clustering = LeidenClustering()
clustering.fit(d12_Pb)
leiden_labels = clustering.labels_
leiden_labels

array([0, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0])

clustering_evaluation(d12_Pb, leiden_labels)
clustering_plot(d12_Pb, leiden_labels, 'Leiden聚类结果在UMAP上的可视化')

Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.26

在这里插入图片描述

Affinity Propagation 亲和力传播

from sklearn.cluster import AffinityPropagation

# define the model
ap = AffinityPropagation()
# fit the model
ap.fit(d12_Pb)
# assign a cluster to each example
ap_labels = ap.predict(d12_Pb)
ap_labels

array([1, 2, 0, 2, 1, 2, 4, 2, 4, 2, 0, 3, 3, 1, 1, 3, 3, 3, 1, 3, 1, 3,
       1, 5, 5, 1, 4, 4, 5, 1, 3, 3, 4, 4, 4, 1, 4, 5, 4, 1, 5, 1, 3, 5,
       5, 4, 1, 5, 1])

clustering_evaluation(d12_Pb, ap_labels)
clustering_plot(d12_Pb, ap_labels, 'Affinity Propagation聚类结果在UMAP上的可视化')

Silhouette Score: 0.33
Davies-Bouldin Index: 0.95
Calinski-Harabasz Index: 52.03

在这里插入图片描述

Agglomerative Clustering 聚集聚类

from sklearn.cluster import AgglomerativeClustering

# define the model
ac = AgglomerativeClustering(n_clusters=3)
# fit model and predict clusters
ac_labels = ac.fit_predict(d12_Pb)
ac_labels

array([0, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0])

clustering_evaluation(d12_Pb, ac_labels)
clustering_plot(d12_Pb, ac_labels, 'Agglomerative Clustering聚类结果在UMAP上的可视化')

Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.44

在这里插入图片描述

BIRCH聚类

from sklearn.cluster import Birch

bir = Birch(threshold=0.01, n_clusters=3)
# fit the model
bir.fit(d12_Pb)
# assign a cluster to each example
bir_labels = bir.predict(d12_Pb)
print(bir_labels)

clustering_evaluation(d12_Pb, bir_labels)
clustering_plot(d12_Pb, bir_labels, 'BIRCH聚类结果在UMAP上的可视化')

[0 2 2 2 0 2 1 2 1 2 2 1 1 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 1 1 0 1
 0 1 0 0 0 1 0 0 1 0 0 0]
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.44

在这里插入图片描述

OPTICS 聚类

from sklearn.cluster import OPTICS

op = OPTICS(eps=0.8, min_samples=10)
# fit model and predict clusters
op_labels = op.fit_predict(d12_Pb)
print(op_labels)

clustering_evaluation(d12_Pb, op_labels)
clustering_plot(d12_Pb, op_labels, 'OPTICS聚类结果在UMAP上的可视化')

[-1 -1 -1 -1  0 -1  1 -1 -1 -1 -1  1  1  0  0  1  1  1  0  1  0  1  0 -1
 -1  0  1  1 -1  0  1  1  1  1  1  0  1  0 -1  0  0  0  1  0 -1 -1  0  0
  0]
Silhouette Score: 0.33
Davies-Bouldin Index: 2.80
Calinski-Harabasz Index: 31.64

在这里插入图片描述

谱聚类 Spectral Clustering

from sklearn.cluster import SpectralClustering

sc = SpectralClustering(n_clusters=3)
# fit model and predict clusters
sc_labels = model.fit_predict(d12_Pb)
print(sc_labels)

clustering_evaluation(d12_Pb, sc_labels)
clustering_plot(d12_Pb, sc_labels, '谱聚类结果在UMAP上的可视化')

[0 2 2 0 0 1 1 2 1 2 2 1 1 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0
 0 0 0 0 0 1 0 0 1 0 0 0]
Silhouette Score: 0.42
Davies-Bouldin Index: 0.81
Calinski-Harabasz Index: 43.88

在这里插入图片描述

有监督变量筛选（聚类后筛选化学成分）

在这里插入图片描述

X = d12_Pb_norm
y = gmm_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                 test_size=0.25, random_state=1)

随机森林度量特征重要性

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics  
from sklearn.metrics import classification_report

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

importances = clf.feature_importances_
feature_imp_df = pd.DataFrame({'变量名': X.columns, '重要性程度': importances}).sort_values('重要性程度', ascending=True) 

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9230769230769231
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.89      1.00      0.94         8
           2       1.00      1.00      1.00         3

    accuracy                           0.92        13
   macro avg       0.96      0.83      0.87        13
weighted avg       0.93      0.92      0.91        13

import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

# Linux show Chinese characters *** important
plt.rcParams['font.family'] = 'WenQuanYi Micro Hei' 

fig = px.bar(feature_imp_df, y="变量名", x="重要性程度",
             title="各化学成分在铅钡玻璃中的重要性程度", 
             orientation='h', width=600, height=500)
# center title
fig.update_layout(title_x=0.5)
# remove background color
fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()

在这里插入图片描述

单变量特征选择 – F 检验

https://scikit-learn.org/stable/modules/feature_selection.html

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Create and fit selector
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
# Get columns to keep and create new dataframe with those only
cols_idxs = selector.get_support(indices=True)
d12_Pb_norm_F = X.iloc[:,cols_idxs]
d12_Pb_norm_F.head()

	二氧化硅(SiO2)	氧化钙(CaO)	氧化铝(Al2O3)	氧化铁(Fe2O3)	氧化铜(CuO)	氧化铅(PbO)	氧化钡(BaO)	五氧化二磷(P2O5)	二氧化硫(SO2)
0	0.453545	0.365625	0.380130	0.405229	0.024598	0.626006	0.000000	0.252654	0.000000
1	0.228723	0.231250	0.064075	0.000000	0.984863	0.318174	0.880959	0.254069	0.161755
2	0.012397	0.498437	0.047516	0.000000	0.297067	0.380069	0.863752	0.535032	0.942320
3	0.416075	0.548437	0.161267	0.000000	0.466414	0.264160	0.412130	0.663836	0.000000
4	0.361053	0.457813	0.224622	0.289760	0.332072	0.550320	0.150917	0.624912	0.000000

递归特征消除RFE

https://www.geeksforgeeks.org/recursive-feature-elimination-with-cross-validation-in-scikit-learn/

from sklearn.feature_selection import RFE
from sklearn.svm import SVR

selector = RFE(estimator=LogisticRegression(), step=1)
selector = selector.fit(X, y)

# Print the optimal number of features 
print("Optimal number of features: %d" % selector.n_features_) 
  
# Print the selected features 
print("Selected features: %s" % selector.support_) 

# Print the rank of features 
print("Selected features: %s" % selector.ranking_)

Optimal number of features: 5
Selected features: [ True False  True False False  True  True  True False False]
Selected features: [1 2 1 4 3 1 1 1 6 5]

d12_Pb_norm_rfe = X.iloc[:, selector.support_] 
d12_Pb_norm_rfe.head()

	二氧化硅(SiO2)	氧化钙(CaO)	氧化铜(CuO)	氧化铅(PbO)	氧化钡(BaO)
0	0.453545	0.365625	0.024598	0.626006	0.000000
1	0.228723	0.231250	0.984863	0.318174	0.880959
2	0.012397	0.498437	0.297067	0.380069	0.863752
3	0.416075	0.548437	0.466414	0.264160	0.412130
4	0.361053	0.457813	0.332072	0.550320	0.150917