Python Code for Problem C, Question 2(2), of the 2022 China Undergraduate Mathematical Contest in Modeling (Higher Education Press Cup)

2.2 For each glass category, select suitable chemical components to divide it into subclasses. Give the specific division method and results, and analyze the reasonableness and sensitivity of the classification.

import numpy as np
import pandas as pd  # used throughout for DataFrame handling

# Split the previously loaded dataframe d12 by glass type;
# columns 6:20 hold the 14 chemical-component proportions
d12_K = d12.iloc[np.where(d12['类型'] == '高钾')[0], 6:20]
d12_Pb = d12.iloc[np.where(d12['类型'] == '铅钡')[0], 6:20]
print(d12_K.shape)
print(d12_Pb.shape)
(18, 14)
(49, 14)

Lead-barium glass

# Min-Max Normalization
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
d12_Pb_norm = pd.DataFrame(min_max_scaler.fit_transform(d12_Pb), 
             columns=list(d12_Pb.columns))
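Min-max scaling maps each component column onto [0, 1]:

$$x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$$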
Unsupervised feature selection (screening chemical components before clustering)

https://datascience.stackexchange.com/questions/29572/is-it-possible-to-do-feature-selection-for-unsupervised-machine-learning-problem


Variance threshold filtering

https://scikit-learn.org/stable/modules/feature_selection.html

from sklearn.feature_selection import VarianceThreshold

def variance_threshold_selector(data, threshold=0.5):
    # Keep only the columns whose variance exceeds `threshold`
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

d12_Pb_norm_var = variance_threshold_selector(d12_Pb_norm, 0.01)
print(d12_Pb_norm_var.shape)
d12_Pb_norm_var.head()
(49, 14)
Columns (in order): 二氧化硅(SiO2), 氧化钠(Na2O), 氧化钾(K2O), 氧化钙(CaO), 氧化镁(MgO), 氧化铝(Al2O3), 氧化铁(Fe2O3), 氧化铜(CuO), 氧化铅(PbO), 氧化钡(BaO), 五氧化二磷(P2O5), 氧化锶(SrO), 氧化锡(SnO2), 二氧化硫(SO2)
Row 0: 0.453545, 0.0, 0.744681, 0.365625, 0.432234, 0.380130, 0.405229, 0.024598, 0.626006, 0.000000, 0.252654, 0.169643, 0.0, 0.000000
Row 1: 0.228723, 0.0, 0.000000, 0.231250, 0.000000, 0.064075, 0.000000, 0.984863, 0.318174, 0.880959, 0.254069, 0.330357, 0.0, 0.161755
Row 2: 0.012397, 0.0, 0.000000, 0.498437, 0.000000, 0.047516, 0.000000, 0.297067, 0.380069, 0.863752, 0.535032, 0.473214, 0.0, 0.942320
Row 3: 0.416075, 0.0, 0.148936, 0.548437, 0.260073, 0.161267, 0.000000, 0.466414, 0.264160, 0.412130, 0.663836, 0.330357, 0.0, 0.000000
Row 4: 0.361053, 0.0, 0.000000, 0.457813, 0.216117, 0.224622, 0.289760, 0.332072, 0.550320, 0.150917, 0.624912, 0.169643, 0.0, 0.000000

At a threshold of 0.01, all 14 normalized components survive the variance filter, so a second, graph-based criterion is applied next.
Laplacian Score

The Laplacian Score is a filter-style algorithm that scores the features of a training set: every feature receives a score, and the k features with the lowest scores are kept as the final feature subset.
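In the notation of the implementation below, with degree matrix $D$ and graph Laplacian $L = D - S$, each feature vector $f_r$ is first centered with respect to $D$ and then scored; lower scores are better:

$$\tilde{f}_r = f_r - \frac{f_r^\top D \mathbf{1}}{\mathbf{1}^\top D \mathbf{1}}\,\mathbf{1}, \qquad L_r = \frac{\tilde{f}_r^\top L\,\tilde{f}_r}{\tilde{f}_r^\top D\,\tilde{f}_r}$$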

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform


def cal_lap_score(features, D, L):
    # Center the feature vector with respect to the degree matrix D
    features_ = features - np.sum((features @ D) / np.sum(D))
    # Laplacian score: the smaller, the better the feature preserves locality
    L_score = (features_ @ L @ features_) / (features_ @ D @ features_)
    return L_score


def get_k_nearest(dist, k, sample_index):
    # dist is zero means it is the sample itself
    return sorted(
        range(len(dist)),
        key=lambda i: dist[i] if i != sample_index else np.inf
    )[:k] + [sample_index]


def laplacian_score(df_arr, label=None, **kwargs):
    kwargs.setdefault("k_nearest", 5)
    
    '''
    Construct distance matrix, dist_matrix, using euclidean distance
    '''
    distances = pdist(df_arr, metric='euclidean')
    dist_matrix = squareform(distances)
    del distances
    '''
    Determine the edge of each sample pairs by k nearest neighbor
    '''
    edge_sparse_matrix = pd.DataFrame(
        np.zeros((df_arr.shape[0], df_arr.shape[0])),
        dtype=int
    )
    if label is None:
        for row_id, row in enumerate(dist_matrix):
            k_nearest_id = get_k_nearest(row, kwargs["k_nearest"], row_id)
            edge_sparse_matrix.iloc[k_nearest_id, k_nearest_id] = 1
    else:
        label = np.array(label)
        unique_label = np.unique(label)
        for i in unique_label:
            group_index = np.where(label == i)[0]
            edge_sparse_matrix.iloc[group_index, group_index] = 1
    # NB: raw distances serve as edge weights here; the classic Laplacian
    # Score (He et al., 2005) uses heat-kernel weights exp(-d^2/t) instead
    S = dist_matrix * edge_sparse_matrix
    del dist_matrix, edge_sparse_matrix
    '''
    Calculate the Laplacian graph L
    '''
    D = np.diag(S.sum(axis=1))
    L = D - S
    del S
    '''
    Minimize the Laplacian score
    '''
    features_lap_score = np.apply_along_axis(
        func1d=lambda f: cal_lap_score(f, D, L), axis=0, arr=df_arr
    )
    return features_lap_score
data = {'化学成分': list(d12_Pb.columns),
        '拉普拉斯分数': list(laplacian_score(np.array(d12_Pb_norm), k_nearest=3))}
d12_Pb_norm_lap = pd.DataFrame(data)
d12_Pb_norm_lap = d12_Pb_norm_lap.sort_values('拉普拉斯分数')
d12_Pb_norm_lap
    化学成分          拉普拉斯分数
0   二氧化硅(SiO2)    0.287870
8   氧化铅(PbO)      0.304957
9   氧化钡(BaO)      0.395260
1   氧化钠(Na2O)     0.397298
10  五氧化二磷(P2O5)  0.428794
4   氧化镁(MgO)      0.461562
5   氧化铝(Al2O3)    0.473288
3   氧化钙(CaO)      0.485366
6   氧化铁(Fe2O3)    0.486807
7   氧化铜(CuO)      0.569931
11  氧化锶(SrO)      0.613548
13  二氧化硫(SO2)    0.789059
2   氧化钾(K2O)      0.828154
12  氧化锡(SnO2)     0.937966
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

# Set a font with Chinese glyphs so labels render correctly on Linux
plt.rcParams['font.family'] = 'WenQuanYi Micro Hei'

fig = px.bar(d12_Pb_norm_lap, y="化学成分", x="拉普拉斯分数",
             title="各化学成分的拉普拉斯分数", orientation='h')
# center title
fig.update_layout(title_x=0.5)
# remove background color
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()

[Figure: horizontal bar chart of the Laplacian score of each chemical component]

Clustering

https://machinelearningmastery.com/clustering-algorithms-with-python/

K-means clustering
from sklearn.cluster import KMeans
fig = plt.figure(figsize=(10,8))
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

inertias = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(d12_Pb)
    inertias.append(kmeans.inertia_)

plt.plot(range(1,11), inertias, marker='o', linewidth=3)
plt.title('Elbow (铅钡玻璃)')
plt.xlabel('聚类数量')
plt.ylabel('Inertia')
plt.rc('font', size=20)  
plt.rc('figure', titlesize=20)
plt.show()

[Figure: elbow plot of inertia versus number of clusters for lead-barium glass]
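The elbow read off the inertia curve can be cross-checked by sweeping k and comparing silhouette scores; a minimal sketch (the random_state is an assumption):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Silhouette needs at least 2 clusters, so start the sweep at k = 2
for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(d12_Pb)
    print(k, round(silhouette_score(d12_Pb, labels), 3))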

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(d12_Pb)
km_labels = kmeans.labels_
km_labels
array([2, 0, 0, 0, 2, 0, 1, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1,
       2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2,
       2, 1, 2, 2, 2], dtype=int32)
pd.DataFrame(kmeans.cluster_centers_)
Cluster centers (columns 0-13, in the same order as the 14 chemical components):
Center 0: 21.592857, 0.000000, 0.188571, 1.871429, 0.101429, 2.008571, 0.215714, 6.555714, 26.344286, 27.705714, 5.084286, 0.464286, 6.938894e-18, 5.074286
Center 1: 58.496500, 1.975000, 0.161500, 1.199500, 0.739000, 5.044500, 0.580000, 0.984500, 20.413000, 7.196500, 0.896500, 0.234000, 7.700000e-02, 0.183000
Center 2: 26.537273, 0.219091, 0.179545, 2.880000, 0.733636, 2.943636, 0.865000, 1.205455, 47.338182, 8.007273, 4.900909, 0.414545, 5.954545e-02, 0.000000
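The bare 0-13 column indices are hard to read; the component names can be attached back to the centers directly (a sketch):

# Label the cluster centers with the original component names
pd.DataFrame(kmeans.cluster_centers_, columns=d12_Pb.columns)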

Clustering evaluation metrics

https://www.geeksforgeeks.org/clustering-metrics/

def clustering_evaluation(data, label):
    from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

    # Calculate clustering metrics
    silhouette = silhouette_score(data, label)
    db_index = davies_bouldin_score(data, label)
    ch_index = calinski_harabasz_score(data, label)

    # Print the metric scores
    print(f"Silhouette Score: {silhouette:.2f}")
    print(f"Davies-Bouldin Index: {db_index:.2f}")
    print(f"Calinski-Harabasz Index: {ch_index:.2f}")
clustering_evaluation(d12_Pb, km_labels)
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.26

Visualizing the clustering results with UMAP

def clustering_plot(data, label, title):
    import umap.umap_ as umap
    reducer = umap.UMAP(random_state=42)
    embedding = reducer.fit_transform(data)
    
    plt.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=label)
    plt.title(title, fontsize=16);
clustering_plot(d12_Pb, km_labels, 'K-Means聚类结果在UMAP上的可视化')

[Figure: UMAP visualization of the K-means clustering result]

Hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import matplotlib.pyplot as plt

linkage_data = linkage(d12_Pb, method='ward', metric='euclidean')
dendrogram(linkage_data)
plt.show()

[Figure: dendrogram from Ward-linkage hierarchical clustering]
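The dendrogram can also be cut directly into a chosen number of flat clusters with scipy's fcluster, without refitting (a sketch; the cut at 2 clusters mirrors the AgglomerativeClustering call below):

from scipy.cluster.hierarchy import fcluster

# Cut the Ward tree into 2 flat clusters (labels are 1-based)
labels_from_tree = fcluster(linkage_data, t=2, criterion='maxclust')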

from sklearn.cluster import AgglomerativeClustering
hierarchical_cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')  # `affinity` was renamed to `metric` in scikit-learn 1.2
hc_labels = hierarchical_cluster.fit_predict(d12_Pb)
hc_labels
array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0])
clustering_evaluation(d12_Pb, hc_labels)
clustering_plot(d12_Pb, hc_labels, '层次聚类结果在UMAP上的可视化')
Silhouette Score: 0.50
Davies-Bouldin Index: 0.70
Calinski-Harabasz Index: 66.81

[Figure: UMAP visualization of the hierarchical clustering result]
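As a simple sensitivity check, the agreement between the partitions produced by different algorithms can be quantified with the adjusted Rand index, where 1.0 means identical partitions up to relabeling (a sketch):

from sklearn.metrics import adjusted_rand_score

# Compare the K-means and hierarchical partitions of the same samples
print(adjusted_rand_score(km_labels, hc_labels))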

Gaussian Mixture Model (GMM)
from sklearn.mixture import GaussianMixture

model = GaussianMixture(n_components=3)  # consider fixing random_state: these labels are reused as y below
# fit the model
model.fit(d12_Pb)
gmm_labels = model.predict(d12_Pb)
print('clustering labels: ')
gmm_labels
clustering labels: 

array([1, 2, 2, 1, 1, 0, 0, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1])
clustering_evaluation(d12_Pb, gmm_labels)
clustering_plot(d12_Pb, gmm_labels, 'GMM聚类结果在UMAP上的可视化')
Silhouette Score: 0.51
Davies-Bouldin Index: 0.75
Calinski-Harabasz Index: 61.87

[Figure: UMAP visualization of the GMM clustering result]
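GMM also offers an information-criterion route to choosing the number of subclasses: sweep n_components and prefer the model with the lowest BIC (a sketch; the range and random_state are assumptions):

from sklearn.mixture import GaussianMixture

for k in range(1, 7):
    gmm = GaussianMixture(n_components=k, random_state=0).fit(d12_Pb)
    print(k, round(gmm.bic(d12_Pb), 1))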

Leiden clustering
from leiden_clustering import LeidenClustering
import numpy as np
clustering = LeidenClustering()
clustering.fit(d12_Pb)
leiden_labels = clustering.labels_
leiden_labels
array([0, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0])
clustering_evaluation(d12_Pb, leiden_labels)
clustering_plot(d12_Pb, leiden_labels, 'Leiden聚类结果在UMAP上的可视化')
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.26

[Figure: UMAP visualization of the Leiden clustering result]

Affinity Propagation
from sklearn.cluster import AffinityPropagation

# define the model
ap = AffinityPropagation()
# fit the model
ap.fit(d12_Pb)
# assign a cluster to each example
ap_labels = ap.predict(d12_Pb)
ap_labels
array([1, 2, 0, 2, 1, 2, 4, 2, 4, 2, 0, 3, 3, 1, 1, 3, 3, 3, 1, 3, 1, 3,
       1, 5, 5, 1, 4, 4, 5, 1, 3, 3, 4, 4, 4, 1, 4, 5, 4, 1, 5, 1, 3, 5,
       5, 4, 1, 5, 1])
clustering_evaluation(d12_Pb, ap_labels)
clustering_plot(d12_Pb, ap_labels, 'Affinity Propagation聚类结果在UMAP上的可视化')
Silhouette Score: 0.33
Davies-Bouldin Index: 0.95
Calinski-Harabasz Index: 52.03

[Figure: UMAP visualization of the Affinity Propagation clustering result]

Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering

# define the model
ac = AgglomerativeClustering(n_clusters=3)
# fit model and predict clusters
ac_labels = ac.fit_predict(d12_Pb)
ac_labels
array([0, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0])
clustering_evaluation(d12_Pb, ac_labels)
clustering_plot(d12_Pb, ac_labels, 'Agglomerative Clustering聚类结果在UMAP上的可视化')
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.44

[Figure: UMAP visualization of the Agglomerative Clustering result]

BIRCH clustering
from sklearn.cluster import Birch

bir = Birch(threshold=0.01, n_clusters=3)
# fit the model
bir.fit(d12_Pb)
# assign a cluster to each example
bir_labels = bir.predict(d12_Pb)
print(bir_labels)

clustering_evaluation(d12_Pb, bir_labels)
clustering_plot(d12_Pb, bir_labels, 'BIRCH聚类结果在UMAP上的可视化')
[0 2 2 2 0 2 1 2 1 2 2 1 1 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 1 1 0 1
 0 1 0 0 0 1 0 0 1 0 0 0]
Silhouette Score: 0.51
Davies-Bouldin Index: 0.85
Calinski-Harabasz Index: 63.44

[Figure: UMAP visualization of the BIRCH clustering result]

OPTICS clustering
from sklearn.cluster import OPTICS

op = OPTICS(eps=0.8, min_samples=10)
# fit model and predict clusters
op_labels = op.fit_predict(d12_Pb)
print(op_labels)

clustering_evaluation(d12_Pb, op_labels)
clustering_plot(d12_Pb, op_labels, 'OPTICS聚类结果在UMAP上的可视化')
[-1 -1 -1 -1  0 -1  1 -1 -1 -1 -1  1  1  0  0  1  1  1  0  1  0  1  0 -1
 -1  0  1  1 -1  0  1  1  1  1  1  0  1  0 -1  0  0  0  1  0 -1 -1  0  0
  0]
Silhouette Score: 0.33
Davies-Bouldin Index: 2.80
Calinski-Harabasz Index: 31.64

[Figure: UMAP visualization of the OPTICS clustering result]
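OPTICS labels noise points as -1, and including them drags the metrics down; scoring only the clustered points gives a complementary view (a sketch):

# Drop the noise points (-1) before computing the metrics
mask = op_labels != -1
clustering_evaluation(d12_Pb[mask], op_labels[mask])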

Spectral Clustering
from sklearn.cluster import SpectralClustering

sc = SpectralClustering(n_clusters=3)
# fit model and predict clusters
sc_labels = sc.fit_predict(d12_Pb)
print(sc_labels)

clustering_evaluation(d12_Pb, sc_labels)
clustering_plot(d12_Pb, sc_labels, '谱聚类结果在UMAP上的可视化')
[0 2 2 0 0 1 1 2 1 2 2 1 1 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0
 0 0 0 0 0 1 0 0 1 0 0 0]
Silhouette Score: 0.42
Davies-Bouldin Index: 0.81
Calinski-Harabasz Index: 43.88

[Figure: UMAP visualization of the spectral clustering result]

Supervised feature selection (screening chemical components after clustering)


from sklearn.model_selection import train_test_split

X = d12_Pb_norm
y = gmm_labels

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                 test_size=0.25, random_state=1)
Measuring feature importance with a random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics  
from sklearn.metrics import classification_report

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

importances = clf.feature_importances_
feature_imp_df = pd.DataFrame({'变量名': X.columns, '重要性程度': importances}).sort_values('重要性程度', ascending=True) 

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 0.9230769230769231
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.89      1.00      0.94         8
           2       1.00      1.00      1.00         3

    accuracy                           0.92        13
   macro avg       0.96      0.83      0.87        13
weighted avg       0.93      0.92      0.91        13
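Impurity-based importances from random forests can be biased toward features with many distinct values; permutation importance on the held-out split is a common cross-check (a sketch; n_repeats is an assumption):

from sklearn.inspection import permutation_importance

# Shuffle each feature on the test split and measure the accuracy drop
result = permutation_importance(clf, X_test, y_test, n_repeats=30, random_state=42)
perm_imp_df = pd.DataFrame({'变量名': X.columns,
                            '重要性程度': result.importances_mean}).sort_values('重要性程度')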
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

# Set a font with Chinese glyphs so labels render correctly on Linux
plt.rcParams['font.family'] = 'WenQuanYi Micro Hei'

fig = px.bar(feature_imp_df, y="变量名", x="重要性程度",
             title="各化学成分在铅钡玻璃中的重要性程度", 
             orientation='h', width=600, height=500)
# center title
fig.update_layout(title_x=0.5)
# remove background color
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()

[Figure: horizontal bar chart of the importance of each chemical component in lead-barium glass]

Univariate feature selection – F-test

https://scikit-learn.org/stable/modules/feature_selection.html

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Create and fit selector
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
# Get columns to keep and create new dataframe with those only
cols_idxs = selector.get_support(indices=True)
d12_Pb_norm_F = X.iloc[:,cols_idxs]
d12_Pb_norm_F.head()
Columns (in order): 二氧化硅(SiO2), 氧化钠(Na2O), 氧化钙(CaO), 氧化铝(Al2O3), 氧化铁(Fe2O3), 氧化铜(CuO), 氧化铅(PbO), 氧化钡(BaO), 五氧化二磷(P2O5), 二氧化硫(SO2)
Row 0: 0.453545, 0.0, 0.365625, 0.380130, 0.405229, 0.024598, 0.626006, 0.000000, 0.252654, 0.000000
Row 1: 0.228723, 0.0, 0.231250, 0.064075, 0.000000, 0.984863, 0.318174, 0.880959, 0.254069, 0.161755
Row 2: 0.012397, 0.0, 0.498437, 0.047516, 0.000000, 0.297067, 0.380069, 0.863752, 0.535032, 0.942320
Row 3: 0.416075, 0.0, 0.548437, 0.161267, 0.000000, 0.466414, 0.264160, 0.412130, 0.663836, 0.000000
Row 4: 0.361053, 0.0, 0.457813, 0.224622, 0.289760, 0.332072, 0.550320, 0.150917, 0.624912, 0.000000
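The F statistics and p-values behind the selection can be inspected directly from the fitted selector (a sketch):

# Rank all 14 components by their ANOVA F statistic
pd.DataFrame({'化学成分': list(X.columns),
              'F': selector.scores_,
              'p-value': selector.pvalues_}).sort_values('F', ascending=False)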
Recursive feature elimination (RFE)

https://www.geeksforgeeks.org/recursive-feature-elimination-with-cross-validation-in-scikit-learn/

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Run RFE on the 10 components retained by the F-test above;
# the support and ranking vectors printed below have length 10
X = d12_Pb_norm_F
selector = RFE(estimator=LogisticRegression(), step=1)
selector = selector.fit(X, y)

# Number of features selected (RFE keeps half the features by default)
print("Number of features selected: %d" % selector.n_features_)

# Mask of the selected features
print("Selected features: %s" % selector.support_)

# Feature ranking (1 = selected)
print("Feature ranking: %s" % selector.ranking_)
Number of features selected: 5
Selected features: [ True False  True False False  True  True  True False False]
Feature ranking: [1 2 1 4 3 1 1 1 6 5]
d12_Pb_norm_rfe = X.iloc[:, selector.support_] 
d12_Pb_norm_rfe.head()
   二氧化硅(SiO2)  氧化钙(CaO)  氧化铜(CuO)  氧化铅(PbO)  氧化钡(BaO)
0  0.453545       0.365625    0.024598    0.626006    0.000000
1  0.228723       0.231250    0.984863    0.318174    0.880959
2  0.012397       0.498437    0.297067    0.380069    0.863752
3  0.416075       0.548437    0.466414    0.264160    0.412130
4  0.361053       0.457813    0.332072    0.550320    0.150917
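The linked article actually demonstrates RFE with cross-validation; RFECV chooses the number of features automatically, which doubles as a sensitivity check on keeping exactly 5 (a sketch; cv and scoring are assumptions):

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

rfecv = RFECV(estimator=LogisticRegression(max_iter=1000), step=1, cv=5, scoring='accuracy')
rfecv.fit(X, y)
print("CV-selected number of features:", rfecv.n_features_)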
