import numpy as np
import pandas as pd

# Reshape d12 from wide to long: one row per (artifact, chemical component).
df = pd.DataFrame(columns=['文物编号', '风化标记', '化学成分含量', '化学成分标签',
                           '纹饰', '类型', '颜色', '表面风化'],
                  index=range(d12.shape[0] * 14))
df['文物编号'] = list(d12['文物编号']) * 14
df['风化标记'] = list(d12['风化标记']) * 14
df['纹饰'] = list(d12['纹饰']) * 14
df['类型'] = list(d12['类型']) * 14
df['颜色'] = list(d12['颜色']) * 14
df['表面风化'] = list(d12['表面风化']) * 14
# The 14 chemical-component columns sit at d12.columns[6:20].
df['化学成分标签'] = list(np.repeat(list(d12.columns[6:20]), d12.shape[0]))
a = list(d12.iloc[:, 6])
for i in range(7, 20):
    a.extend(d12.iloc[:, i])
df['化学成分含量'] = a
df.head()
   文物编号 风化标记  化学成分含量       化学成分标签 纹饰  类型  颜色 表面风化
0     1   其它   69.33  二氧化硅(SiO2)  C  高钾  蓝绿  无风化
1     2   其它   36.28  二氧化硅(SiO2)  A  铅钡  浅蓝   风化
2     3   其它   87.05  二氧化硅(SiO2)  A  高钾  蓝绿  无风化
3     3   其它   61.71  二氧化硅(SiO2)  A  高钾  蓝绿  无风化
4     4   其它   65.88  二氧化硅(SiO2)  A  高钾  蓝绿  无风化
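As an aside, pandas' melt builds the same long table in one call; a sketch, assuming the 14 component columns occupy d12.columns[6:20] as above:

# Equivalent reshape via pd.melt (sketch):
df_long = d12.melt(
    id_vars=['文物编号', '风化标记', '纹饰', '类型', '颜色', '表面风化'],
    value_vars=list(d12.columns[6:20]),
    var_name='化学成分标签',
    value_name='化学成分含量',
)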
Data Visualization
import plotly.express as px
fig = px.box(df, x="化学成分标签", y="化学成分含量", color="风化标记")
# Remove the plot background colour.
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)'})
fig.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')
# Fill the rows where 颜色 is missing: 浅蓝 (light blue) is the most frequent
# colour among both weathered and unweathered glass, so impute with the mode.
index = pd.isna(df['颜色'])
index = np.where(index)[0]
df.iloc[index, 6] = '浅蓝'  # column 6 is 颜色
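The notebook never shows how X_train and y_train were produced. A minimal sketch, assuming the categorical columns are label-encoded and the data split 80/20 (consistent with the 750 training rows LightGBM reports below):

# Assumed preparation step (not shown in the notebook export):
df_encode = df.copy()
for col in ['风化标记', '化学成分标签', '纹饰', '类型', '颜色', '表面风化']:
    df_encode[col] = LabelEncoder().fit_transform(df_encode[col].astype(str))
X = df_encode.drop('化学成分含量', axis=1)
y = df_encode['化学成分含量'].astype(float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)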
# Fit a random forest regressor to the dataset, with out-of-bag scoring.
RF = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)
RF.fit(X_train, y_train)

# Evaluate the model.
from sklearn.metrics import mean_squared_error, r2_score

oob_score = RF.oob_score_
print(f'Out-of-Bag Score: {oob_score}')

# Make predictions on the held-out data.
predictions = RF.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Out-of-Bag Score: 0.858876239921834
Mean Squared Error: 18.621843142992024
R-squared: 0.9440368907222724
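cross_val_score was imported above but never used; a short sketch of how it could replace the single train/test split with a more robust 5-fold estimate:

# Optional robustness check (sketch, not in the original notebook):
scores = cross_val_score(RF, X, y, cv=5, scoring='r2')
print(f'CV R-squared: {scores.mean():.3f} +/- {scores.std():.3f}')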
XGBoost Regression
import numpy as np
import pandas as pd
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
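The cell that builds and fits the XGBoost model is missing from the export; judging from the repr printed below, it was presumably along these lines (and the warning notes that reg:linear is deprecated in favour of reg:squarederror):

# Presumed from the printed repr below; the exact cell is not in the export:
xgb_r = xg.XGBRegressor(objective='reg:linear', n_estimators=10, seed=123)
xgb_r.fit(X_train, y_train)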
[14:07:55] WARNING: /workspace/src/objective/regression_obj.cu:167: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints=None,
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints=None,
n_estimators=10, n_jobs=0, num_parallel_tree=1,
objective='reg:linear', random_state=123, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=123, subsample=1,
tree_method=None, validate_parameters=False, verbosity=None)
predictions = xgb_r.predict(X_test)

# Evaluate the model.
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 31.125352266521848
R-squared: 0.9064608440301115
Gradient Boosting Regression
# Importing libraries.
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
GB = GradientBoostingRegressor()
GB.fit(X_train, y_train)

# Make predictions and evaluate the model.
predictions = GB.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 44.14340281131733
R-squared: 0.8673384768386802
LightGBM Regression
from lightgbm import LGBMRegressor
gbm = LGBMRegressor()
gbm.fit(X_train, y_train)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 7
[LightGBM] [Info] Start training from score 6.549480
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf  (repeated once per remaining boosting iteration)
LGBMRegressor()
# Make predictions and evaluate the model.
predictions = gbm.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 38.507655924247906
R-squared: 0.8842752492344551
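The repeated "no further splits" warnings usually mean LightGBM's default min_child_samples=20 blocks splits on a training set this small. One common remedy is to loosen the leaf constraints; the values below are illustrative, not tuned:

# Illustrative re-fit with looser leaf constraints (assumed values):
gbm = LGBMRegressor(min_child_samples=5, num_leaves=15)
gbm.fit(X_train, y_train)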
CatBoost Regression
import catboost
from catboost import CatBoostRegressor
cat = CatBoostRegressor(verbose=0, n_estimators=100)
cat.fit(X_train, y_train)

# Make predictions and evaluate the model.
predictions = cat.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 19.8730131181204
R-squared: 0.94027682457278
Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(random_state=0)
tree.fit(X_train, y_train)

# Make predictions and evaluate the model.
predictions = tree.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 18.104645212765963
R-squared: 0.9455911946687293
MLP Regression
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(random_state=0)
mlp.fit(X_train, y_train)

# Make predictions and evaluate the model.
predictions = mlp.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 213.8644123666194
R-squared: 0.3572860974079113
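The much weaker MLP score is likely due to unscaled features; StandardScaler was imported earlier but never applied. A scaled pipeline, as a sketch rather than the notebook's actual setup:

from sklearn.pipeline import make_pipeline

# Sketch: the same MLP inside a scaling pipeline (max_iter raised for convergence):
mlp_scaled = make_pipeline(StandardScaler(), MLPRegressor(random_state=0, max_iter=1000))
mlp_scaled.fit(X_train, y_train)
print(f'R-squared: {r2_score(y_test, mlp_scaled.predict(X_test))}')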
Prediction
# Generate the test data: take the weathered samples and flip the
# 表面风化 flag to 0 to simulate their pre-weathering state.
df_pred = df_encode[df_encode['表面风化'] == 1].copy()
df_pred_origin = df[df_encode['表面风化'] == 1]  # keep the un-encoded rows for reference
df_pred['表面风化'] = 0
X_pred = df_pred.drop('化学成分含量', axis=1)
X_pred
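The call that actually produces the pre-weathering predictions is not shown. Presumably it resembled the line below (the model choice is an assumption; the random forest scored best above), with the result attached to df_pred_origin so the averaging code that follows can read it:

# Sketch of the unshown prediction step (model choice assumed):
pre_weathering = RF.predict(X_pred)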
# For the artifacts sampled at two testing points, take the mean of the
# predicted values.
dual = list(df_pred['文物编号'].value_counts().index[0:8])
labels = list(df_pred_origin['化学成分标签'].unique())
pre_mean = pd.DataFrame(columns=['文物编号', '化学成分标签', '风化前预测'],
                        index=range(len(dual) * len(labels)))
pre_mean['文物编号'] = list(np.repeat(dual, len(labels)))
pre_mean['化学成分标签'] = labels * len(dual)
for i in dual:
    i = int(i)
    for j in labels:
        if i in list(df_pred_origin['文物编号'].unique()):  # skip IDs absent from the reference frame
            index = np.where((df_pred_origin['文物编号'] == i)
                             & (df_pred_origin['化学成分标签'] == j))[0]
            pre_mean.iloc[dual.index(i) * len(labels) + labels.index(j), 2] = \
                df_pred_origin.iloc[index, 7].mean()
# Collect the row indices of samples from single-point artifacts (not in dual).
df_pred_origin.index = range(df_pred_origin.shape[0])
my_index = []
for i in list(df_pred_origin.index):
    if df_pred_origin.iloc[i, 0] not in dual:
        my_index.append(i)