完整版随后更新到资源中!
1.问题1
import seaborn as sns
import matplotlib.pyplot as plt  # plt was used below without a visible import
import warnings

warnings.filterwarnings("ignore")

# CJK-capable font and correct minus-sign rendering for the plots.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One distribution plot (density histogram + KDE curve) per composition
# column (columns 2..15) of data_2_1_new.
# NOTE: sns.distplot() is deprecated and removed in modern seaborn;
# histplot(..., kde=True, stat="density") is the supported equivalent
# with the same normalized-histogram appearance.
for m in range(2, 16):
    sns.histplot(data_2_1_new.iloc[:, m], kde=True, stat="density")
    plt.show()
# Configure matplotlib so Chinese titles/labels display correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One scatter plot per composition column: the first 14 rows are the
# unweathered samples (green), the remaining rows weathered (red).
for col_idx in range(2, 16):
    plt.title('%s含量的变化情况' % (data_2_1_new.columns[col_idx]))
    plt.scatter(data_2_1_new.iloc[:14, 0], data_2_1_new.iloc[:14, col_idx],
                c='green', label='未风化')
    plt.scatter(data_2_1_new.iloc[14:, 0], data_2_1_new.iloc[14:, col_idx],
                c='red', label='风化')
    plt.legend()
    plt.show()
import seaborn as sns
import matplotlib.pyplot as plt  # plt was used below without a visible import
import warnings

warnings.filterwarnings("ignore")

# CJK-capable font and correct minus-sign rendering for the plots.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One distribution plot (density histogram + KDE curve) per composition
# column (columns 2..15) of data_3_1.
# NOTE: sns.distplot() is deprecated and removed in modern seaborn;
# histplot(..., kde=True, stat="density") is the supported equivalent.
for m in range(2, 16):
    sns.histplot(data_3_1.iloc[:, m], kde=True, stat="density")
    plt.show()
# Configure matplotlib so Chinese titles/labels display correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One scatter plot per composition column: the first 24 rows are the
# unweathered samples (green), the remaining rows weathered (red).
for col_idx in range(2, 16):
    plt.title('%s含量的变化情况' % (data_3_1.columns[col_idx]))
    plt.scatter(data_3_1.iloc[:24, 0], data_3_1.iloc[:24, col_idx],
                c='green', label='未风化')
    plt.scatter(data_3_1.iloc[24:, 0], data_3_1.iloc[24:, col_idx],
                c='red', label='风化')
    plt.legend()
    plt.show()
2.问题2
# Progress-bar support for pandas .apply, plus regex for id extraction.
from tqdm import tqdm
import re
# Registers .progress_apply / .progress_map on pandas objects.
tqdm.pandas()
def clear_id(s):
    """Extract the first run of digits from *s* and return it as an int.

    Used to pull the numeric artifact id out of labels such as '文物12'
    or '12部位1' so the two sheets can be joined on a common key.

    Args:
        s: any value; it is stringified before matching.

    Returns:
        int: the first digit run found in str(s).

    Raises:
        ValueError: if *s* contains no digits (the original code raised a
            cryptic IndexError from an empty findall() result instead).
    """
    text = str(s)
    match = re.search(r"\d+", text)
    if match is None:
        raise ValueError("no digits found in id field: %r" % (text,))
    return int(match.group())
# Derive the numeric join key on both tables from their text id columns
# (sampling-point table first, then the artifact table, as before).
for frame, id_col in ((data_2_, '文物采样点'), (data_2, '文物编号')):
    frame['id'] = frame[id_col].progress_apply(clear_id)
# Fourteen oxide-content feature columns.
cols1 = [
    '二氧化硅(SiO2)', '氧化钠(Na2O)', '氧化钾(K2O)', '氧化钙(CaO)',
    '氧化镁(MgO)', '氧化铝(Al2O3)', '氧化铁(Fe2O3)', '氧化铜(CuO)',
    '氧化铅(PbO)', '氧化钡(BaO)', '五氧化二磷(P2O5)', '氧化锶(SrO)',
    '氧化锡(SnO2)', '二氧化硫(SO2)',
]
# Categorical descriptor columns.
cols2 = ['纹饰', '类型', '颜色', '表面风化']
# Full feature set: descriptors followed by oxide contents.
cols = cols2 + cols1

# Join artifact metadata onto the composition measurements on the numeric
# id, then split the joined table by glass type.
file_data = pd.merge(data_2, data_2_, on='id')
Gao_data = file_data[file_data['类型'] == '高钾'][cols]
Qian_data = file_data[file_data['类型'] == '铅钡'][cols]
from sklearn.preprocessing import LabelEncoder

# Label-encode the string-valued columns of the high-potassium table into
# a new frame; numeric/boolean columns are copied through unchanged.
Gao_train = pd.DataFrame()
label = LabelEncoder()
X = Gao_data
for column in X.columns:
    if X[column].dtype == 'object':
        # String column (object dtype) -> integer codes, refit per column.
        Gao_train[column] = label.fit_transform(X[column])
    else:
        # Numeric column -> copy the values as-is.
        Gao_train[column] = list(X[column])
# Same label-encoding procedure for the lead-barium table.
Qian_train = pd.DataFrame()
label = LabelEncoder()
X = Qian_data
for column in X.columns:
    if X[column].dtype == 'object':
        Qian_train[column] = label.fit_transform(X[column])
    else:
        Qian_train[column] = list(X[column])
from sklearn.cluster import AgglomerativeClustering  # hierarchical clustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['STSong']

# Scan cluster counts k = 2..8 for Ward hierarchical clustering of the
# high-potassium samples, recording each fit's silhouette score.
# (The original also declared an SSE list that was never filled or used;
# it has been removed.)
data = np.array(Gao_train)
Scores = []  # silhouette score per k
for k in range(2, 9):
    estimator = AgglomerativeClustering(n_clusters=k, linkage='ward')
    estimator.fit(data)
    # data is already an ndarray; the original re-wrapped it in np.array().
    Scores.append(silhouette_score(data, estimator.labels_,
                                   metric='euclidean'))

# Silhouette score vs. k: pick k near the curve's peak.
X = range(2, 9)
plt.xlabel('k值', fontsize=20)
plt.ylabel('聚类系数', fontsize=20)
plt.plot(X, Scores, 'o-')
plt.show()
from scipy.cluster.hierarchy import linkage, dendrogram

# Final Ward clustering of the high-potassium data with k = 4, then a
# dendrogram drawn with scipy's hierarchy tools.
k = 4
plt.figure(figsize=(20, 15))
model = AgglomerativeClustering(n_clusters=k, linkage='ward')
data = Gao_train
model.fit(data)
# Attach each sample's cluster label to the original rows.
r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1)
Z = linkage(data, method='ward', metric='euclidean')
P = dendrogram(Z, 0)
plt.show()
from scipy.cluster.hierarchy import linkage, dendrogram

# Same procedure for the lead-barium data: Ward clustering with k = 4
# and the corresponding dendrogram.
plt.figure(figsize=(20, 15))
k = 4
model = AgglomerativeClustering(n_clusters=k, linkage='ward')
data = Qian_train
model.fit(data)
# Attach each sample's cluster label to the original rows.
r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1)
Z = linkage(data, method='ward', metric='euclidean')
P = dendrogram(Z, 0)
plt.show()
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Cluster the high-potassium samples into 2 groups, score the clustering,
# and visualise the clusters in the first two PCA components.
df1 = Gao_train
data1 = np.array(df1)
clf1 = AgglomerativeClustering(n_clusters=2, linkage='ward')
# fit_predict() both fits the model and returns labels; the original
# called fit() and then fit_predict(), fitting the model twice.
pred1 = clf1.fit_predict(data1)
score1 = silhouette_score(data1, pred1)

pca = PCA(n_components=2)  # project to 2-D purely for plotting
newData1 = pca.fit_transform(data1)

# Scatter the projected points, one call per cluster (the original also
# allocated coordinate lists x3..x6/y3..y6 that were never used).
plt.figure(figsize=(12, 8))
marker_size = 50
for cluster in (0, 1):
    mask = pred1 == cluster
    plt.scatter(newData1[mask, 0], newData1[mask, 1], s=marker_size)
plt.show()
# Same visualisation for the lead-barium samples, with 5 clusters.
df2 = Qian_train
data2 = np.array(df2)
clf2 = AgglomerativeClustering(n_clusters=5, linkage='ward')
# Single fit via fit_predict (the original fitted the model twice).
pred2 = clf2.fit_predict(data2)
score2 = silhouette_score(data2, pred2)

pca = PCA(n_components=2)  # project to 2-D purely for plotting
newData2 = pca.fit_transform(data2)

# Scatter the projected points per cluster, replacing the unrolled
# x1..x5/y1..y5 accumulation (x6/y6 were allocated but never used).
plt.figure(figsize=(12, 8))
marker_size = 50
for cluster in range(5):
    mask = pred2 == cluster
    plt.scatter(newData2[mask, 0], newData2[mask, 1], s=marker_size)
plt.show()
3.问题3
# Problem 3: load the three sheets of the competition attachment workbook.
file1 = pd.read_excel('./附件.xlsx',sheet_name='表单1')
file2 = pd.read_excel('./附件.xlsx',sheet_name='表单2')
file3 = pd.read_excel('./附件.xlsx',sheet_name='表单3')
# Progress-bar support for pandas .apply, plus regex for id extraction.
from tqdm import tqdm
import re
# Registers .progress_apply / .progress_map on pandas objects.
tqdm.pandas()
def clear_id(s):
    """Return the first run of digits in *s* (stringified) as an int."""
    digit_runs = re.findall(r"\d+", str(s))
    return int(digit_runs[0])
# Build the numeric join key on both sheets (sheet 2 first, then sheet 1,
# as before).
for sheet, id_col in ((file2, '文物采样点'), (file1, '文物编号')):
    sheet['id'] = sheet[id_col].progress_apply(clear_id)
# Weathering flag plus the fourteen oxide-content columns.
cols = [
    '表面风化', '二氧化硅(SiO2)', '氧化钠(Na2O)', '氧化钾(K2O)',
    '氧化钙(CaO)', '氧化镁(MgO)', '氧化铝(Al2O3)', '氧化铁(Fe2O3)',
    '氧化铜(CuO)', '氧化铅(PbO)', '氧化钡(BaO)', '五氧化二磷(P2O5)',
    '氧化锶(SrO)', '氧化锡(SnO2)', '二氧化硫(SO2)',
]

# Merge metadata with measurements, split by glass type, and use sheet 3
# as the unlabeled test set.
file_data = pd.merge(file1, file2, on='id')
Gao_data = file_data[file_data['类型'] == '高钾'][cols]
Qian_data = file_data[file_data['类型'] == '铅钡'][cols]
test_all = file3[cols]

# Fill missing measurements with 0 before modelling.
Qian_data = Qian_data.fillna(0)
Gao_data = Gao_data.fillna(0)
test_all = test_all.fillna(0)

# Stack train and test so the categorical encoder sees every category.
train_all = pd.concat([Gao_data, Qian_data])
encode_data = pd.concat([train_all, test_all])
len(train_all)  # notebook-style echo of the training-set size
from sklearn.preprocessing import LabelEncoder

# Label-encode string columns of the combined train+test table; numeric
# columns are copied through unchanged.
all_data = pd.DataFrame()
label = LabelEncoder()
X = encode_data
for column in X.columns:
    if X[column].dtype == 'object':
        # String column (object dtype) -> integer codes.
        all_data[column] = label.fit_transform(X[column])
    else:
        all_data[column] = list(X[column])

# Split the encoded rows back into train and test by position.
train = all_data[0:len(train_all)]
test = all_data[len(train_all):]
from sklearn.model_selection import train_test_split
import numpy as np

# Binary target: 0 for high-potassium rows, 1 for lead-barium rows
# (they were concatenated in that order when building train_all).
X_var, y_var = train, [0] * len(Gao_data) + [1] * len(Qian_data)
X_train, X_test, y_train, y_test = train_test_split(
    X_var, y_var, test_size=0.2, random_state=123)

from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.metrics import accuracy_score

# Hold-out accuracy of a depth-5 entropy decision tree.
model = dtc(criterion='entropy', max_depth=5)
model.fit(X_train, y_train)
pred_model = model.predict(X_test)
print(accuracy_score(y_test, pred_model))
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Refit on the full training data, draw the fitted tree, then classify
# the unlabeled sheet-3 samples.
model = dtc(criterion='entropy', max_depth=5)
model.fit(X_var, y_var)
feature_names = X_var.columns
target_names = ['高钾玻璃','铅钡玻璃']
plot_tree(
    model,
    feature_names=feature_names,
    class_names=target_names,
    filled=True,
    rounded=True,
)
plt.show()
model.predict(test)
4.问题4
# Problem 4: reload the three sheets of the attachment workbook.
file1 = pd.read_excel('./附件.xlsx',sheet_name='表单1')
file2 = pd.read_excel('./附件.xlsx',sheet_name='表单2')
file3 = pd.read_excel('./附件.xlsx',sheet_name='表单3')
# Progress-bar support for pandas .apply, plus regex for id extraction.
from tqdm import tqdm
import re
# Registers .progress_apply / .progress_map on pandas objects.
tqdm.pandas()
def clear_id(s):
    """Extract the first digit run from *s* and return it as an int."""
    first_run = re.findall(r"\d+", str(s))[0]
    return int(first_run)
# Numeric join key for both sheets (sheet 2 first, then sheet 1).
for sheet, id_col in ((file2, '文物采样点'), (file1, '文物编号')):
    sheet['id'] = sheet[id_col].progress_apply(clear_id)
# Weathering flag plus the fourteen oxide-content columns.
cols = [
    '表面风化', '二氧化硅(SiO2)', '氧化钠(Na2O)', '氧化钾(K2O)',
    '氧化钙(CaO)', '氧化镁(MgO)', '氧化铝(Al2O3)', '氧化铁(Fe2O3)',
    '氧化铜(CuO)', '氧化铅(PbO)', '氧化钡(BaO)', '五氧化二磷(P2O5)',
    '氧化锶(SrO)', '氧化锡(SnO2)', '二氧化硫(SO2)',
]

# Merge, split by glass type, and zero-fill missing measurements.
file_data = pd.merge(file1, file2, on='id')
Gao_data = file_data[file_data['类型'] == '高钾'][cols]
Qian_data = file_data[file_data['类型'] == '铅钡'][cols]
Qian_data = Qian_data.fillna(0)
Gao_data = Gao_data.fillna(0)
import seaborn as sns

# Correlation heatmap of the high-potassium composition columns.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
df = Gao_data
corrmat = df[df.columns].corr()
heatmap = sns.heatmap(corrmat, annot=True, vmax=1, square=True)
ax.set_title('高钾玻璃化学成分相关性', fontsize=16)
plt.tight_layout()
plt.show()
# Correlation heatmap of the lead-barium composition columns.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
df_1 = Qian_data
corrmat = df_1[df_1.columns].corr()
heatmap = sns.heatmap(corrmat, annot=True, vmax=1, square=True)
ax.set_title('铅钡玻璃化学成分相关性', fontsize=10)
plt.tight_layout()
plt.show()
# Pairwise scatter matrices of the composition measurements.
# BUG FIX: the original passed the 14x14 correlation matrix to
# sns.pairplot, which scatters correlation coefficients against each
# other; pairplot is meant to visualise the raw data itself.
df = Qian_data
corrmat = df[df.columns].corr()
sns.pairplot(df)

df = Gao_data
corrmat = df[df.columns].corr()
sns.pairplot(df)