2022 CUMCM (Higher Education Press Cup Contemporary Undergraduate Mathematical Contest in Modeling), Problem C, Question 1(3): Python Code

1.3 Predicting the pre-weathering chemical composition from measurements taken at weathered sampling points


Data Reshaping

import numpy as np
import pandas as pd

# Reshape the wide table d12 (one row per sampling point, with the 14
# composition columns in positions 6-19) into long format: one row per
# (sampling point, chemical component).
df = pd.DataFrame(columns=['文物编号', '风化标记', '化学成分含量', '化学成分标签',
                           '纹饰', '类型', '颜色', '表面风化'],
                  index=range(d12.shape[0] * 14))
df['文物编号'] = list(d12['文物编号']) * 14
df['风化标记'] = list(d12['风化标记']) * 14
df['纹饰'] = list(d12['纹饰']) * 14
df['类型'] = list(d12['类型']) * 14
df['颜色'] = list(d12['颜色']) * 14
df['表面风化'] = list(d12['表面风化']) * 14
# each component label is repeated once per sampling point
df['化学成分标签'] = list(np.repeat(list(d12.columns[6:20]), d12.shape[0]))

# stack the 14 composition columns into a single content column
a = list(d12.iloc[:, 6])
for i in range(7, 20):
    a.extend(d12.iloc[:, i])
df['化学成分含量'] = a
df.head()
   文物编号  风化标记  化学成分含量  化学成分标签       纹饰  类型  颜色  表面风化
0  1        其它     69.33     二氧化硅(SiO2)  C    高钾  蓝绿  无风化
1  2        其它     36.28     二氧化硅(SiO2)  A    铅钡  浅蓝  风化
2  3        其它     87.05     二氧化硅(SiO2)  A    高钾  蓝绿  无风化
3  3        其它     61.71     二氧化硅(SiO2)  A    高钾  蓝绿  无风化
4  4        其它     65.88     二氧化硅(SiO2)  A    高钾  蓝绿  无风化
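
The same wide-to-long reshape can be done in a single call with pandas' built-in melt. A minimal sketch, assuming d12 holds the six metadata columns followed by the 14 composition columns as above:

# melt stacks the 14 composition columns column-by-column, matching the
# ordering produced by the manual construction above
id_cols = ['文物编号', '风化标记', '纹饰', '类型', '颜色', '表面风化']
df_long = d12.melt(id_vars=id_cols,
                   value_vars=list(d12.columns[6:20]),
                   var_name='化学成分标签',
                   value_name='化学成分含量')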

Data Visualization

import plotly.express as px
fig = px.box(df, x="化学成分标签", y="化学成分含量", color="风化标记")
# remove background color
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)',})
fig.show()

[Figure: box plots of each chemical component's content, grouped by weathering mark]

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings('ignore')
# Fill missing 颜色 (color) values: in both the weathered and unweathered
# glass, '浅蓝' (light blue) is the most frequent color, so impute with the mode.
df['颜色'] = df['颜色'].fillna('浅蓝')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 938 entries, 0 to 937
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   文物编号    938 non-null    int64  
 1   风化标记    938 non-null    object 
 2   化学成分含量  938 non-null    float64
 3   化学成分标签  938 non-null    object 
 4   纹饰      938 non-null    object 
 5   类型      938 non-null    object 
 6   颜色      938 non-null    object 
 7   表面风化    938 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 58.8+ KB
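
The hardcoded '浅蓝' can also be computed rather than assumed; a small equivalent sketch that imputes with whatever color is most frequent:

# mode()[0] returns the most frequent color, so this stays correct even if
# the underlying data changes
df['颜色'] = df['颜色'].fillna(df['颜色'].mode()[0])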
# Move 化学成分含量 next to 文物编号 so the target sits in a fixed position.
df = df.iloc[:, [0, 2, 1] + list(range(3, 8))]

# Encode the categorical (object-dtype) columns as integer labels.
label_encoder = LabelEncoder()
x_categorical = df.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = df.select_dtypes(exclude=['object']).values

df_encode = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1)
df_encode.rename(columns={0: '文物编号', 1: '化学成分含量'},
                 inplace=True)
df_encode.head()
   文物编号  化学成分含量  风化标记  化学成分标签  纹饰  类型  颜色  表面风化
0  1.0     69.33     1       0         2    1    6    0
1  2.0     36.28     1       0         0    0    1    1
2  3.0     87.05     1       0         0    1    6    0
3  3.0     61.71     1       0         0    1    6    0
4  4.0     65.88     1       0         0    1    6    0
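
One caveat with applying a single shared LabelEncoder: it is refit on every column, so only the last fit survives and the integer codes cannot be inverted afterwards. A sketch of keeping one fitted encoder per column, should the original category names be needed later:

# fit and store one encoder per categorical column
encoders = {}
x_categorical = df.select_dtypes(include=['object']).copy()
for col in x_categorical.columns:
    enc = LabelEncoder()
    x_categorical[col] = enc.fit_transform(x_categorical[col])
    encoders[col] = enc

# e.g. map the integer color codes back to their names:
# encoders['颜色'].inverse_transform(df_encode['颜色'].unique())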
# split features/target and hold out 20% of rows for testing
X = df_encode.drop('化学成分含量', axis=1)
y = df_encode['化学成分含量']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=1)

Regression

Random Forest Regression

Reference: https://www.geeksforgeeks.org/random-forest-regression-in-python/

# Fitting Random Forest Regression to the dataset
RF = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)
 
# Fit the regressor with x and y data
RF.fit(X_train, y_train)

# Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score
 
# Access the OOB Score
oob_score = RF.oob_score_
print(f'Out-of-Bag Score: {oob_score}')
 
# Making predictions on the same data or new data
predictions = RF.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Out-of-Bag Score: 0.858876239921834
Mean Squared Error: 18.621843142992024
R-squared: 0.9440368907222724
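A single 80/20 hold-out on 938 rows can be noisy, so the cross_val_score imported earlier (but not yet used) can give a less split-dependent estimate. A sketch:

# 5-fold cross-validated R² for the same random forest configuration
cv_r2 = cross_val_score(RandomForestRegressor(n_estimators=10, random_state=0),
                        X, y, cv=5, scoring='r2')
print(f'CV R-squared: {cv_r2.mean():.4f} ± {cv_r2.std():.4f}')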
XGBoost Regression
import xgboost as xg

# 'reg:linear' was deprecated in favor of the equivalent 'reg:squarederror'
xgb_r = xg.XGBRegressor(objective='reg:squarederror',
                        n_estimators=10, seed=123)
xgb_r.fit(X_train, y_train)
predictions = xgb_r.predict(X_test) 

# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 31.125352266521848
R-squared: 0.9064608440301115
Gradient Boosting Regression

from sklearn.ensemble import GradientBoostingRegressor

GB = GradientBoostingRegressor()
GB.fit(X_train, y_train)

# Making predictions on the same data or new data
predictions = GB.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 44.14340281131733
R-squared: 0.8673384768386802
LightGBM Regression
from lightgbm import LGBMRegressor

gbm = LGBMRegressor()
gbm.fit(X_train, y_train)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 7
[LightGBM] [Info] Start training from score 6.549480
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf   (repeated for many iterations; elided)

LGBMRegressor()
# Making predictions on the same data or new data
predictions = gbm.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 38.507655924247906
R-squared: 0.8842752492344551
CatBoost Regression

from catboost import CatBoostRegressor

cat = CatBoostRegressor(verbose=0, n_estimators=100)
cat.fit(X_train, y_train)

# Making predictions on the same data or new data
predictions = cat.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 19.8730131181204
R-squared: 0.94027682457278
Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(random_state=0)
tree.fit(X_train, y_train)

# Making predictions on the same data or new data
predictions = tree.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 18.104645212765963
R-squared: 0.9455911946687293
MLP Regression
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(random_state=0)
mlp.fit(X_train, y_train)

# Making predictions on the same data or new data
predictions = mlp.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 213.8644123666194
R-squared: 0.3572860974079113
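
The MLP trails the tree-based models by a wide margin, most likely because the inputs are unscaled label codes, to which neural networks are far more sensitive than trees. A sketch of the usual remedy, standardizing features inside a pipeline (whether it closes the gap here is untested):

from sklearn.pipeline import make_pipeline

# scale features before the MLP; max_iter raised since MLPs on small data
# often need more than the default 200 iterations to converge
mlp_scaled = make_pipeline(StandardScaler(),
                           MLPRegressor(random_state=0, max_iter=1000))
mlp_scaled.fit(X_train, y_train)
print(f'R-squared: {r2_score(y_test, mlp_scaled.predict(X_test))}')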

Prediction

# Build the prediction set: all rows measured at weathered sampling points.
# .copy() avoids SettingWithCopyWarning on the assignments below.
df_pred = df_encode[df_encode['表面风化'] == 1].copy()
df_pred_origin = df[df_encode['表面风化'] == 1].copy()

# Flip the weathering flag to 0 to represent the pre-weathering state.
df_pred['表面风化'] = 0
X_pred = df_pred.drop('化学成分含量', axis=1)
X_pred
     文物编号  风化标记  化学成分标签  纹饰  类型  颜色  表面风化
1    2.0     1       0         0    0    1    0
8    7.0     1       0         1    1    6    0
9    8.0     1       0         2    0    4    0
10   8.0     0       0         2    0    4    0
11   9.0     1       0         1    1    6    0
...  ...     ...     ...       ...  ...  ...  ...
932  54.0    1       1         2    0    1    0
933  54.0    0       1         2    0    1    0
935  56.0    1       1         2    0    6    0
936  57.0    1       1         2    0    6    0
937  58.0    1       1         2    0    1    0

588 rows × 7 columns

# Use the random forest model for the final predictions.
predictions = RF.predict(X_pred)
df_pred_origin = df_pred_origin.drop('表面风化', axis=1)
df_pred_origin['风化前预测'] = predictions
df_pred_origin.head()
     文物编号  化学成分含量  风化标记      化学成分标签       纹饰  类型  颜色  风化前预测
1    2       36.28     其它        二氧化硅(SiO2)  A    铅钡  浅蓝  41.3435
8    7       92.63     其它        二氧化硅(SiO2)  B    高钾  蓝绿  60.9480
9    8       20.14     其它        二氧化硅(SiO2)  C    铅钡       37.7020
10   8       4.61      严重风化点  二氧化硅(SiO2)  C    铅钡       28.9610
11   9       95.02     其它        二氧化硅(SiO2)  B    高钾  蓝绿  64.2920

Data Restoration

# For artifacts with two weathered sampling points, take the mean of the two
# predicted values per chemical component.
dual = list(df_pred['文物编号'].value_counts().index[0:8])  # the 8 dual-point artifacts
labels = list(df_pred_origin['化学成分标签'].unique())
pre_mean = pd.DataFrame(columns=['文物编号', '化学成分标签', '风化前预测'],
                        index=range(len(dual) * len(labels)))
pre_mean['文物编号'] = list(np.repeat(dual, len(labels)))
pre_mean['化学成分标签'] = labels * len(dual)

for i in dual:
    i = int(i)
    for j in labels:
        if i in list(df_pred_origin['文物编号'].unique()):
            index = np.where((df_pred_origin['文物编号'] == i) & (df_pred_origin['化学成分标签'] == j))[0]
            pre_mean.iloc[dual.index(i)*len(labels) + labels.index(j), 2] = df_pred_origin.iloc[index, 7].mean()
# Rows for artifacts with a single sampling point are kept as-is.
df_pred_origin.index = range(df_pred_origin.shape[0])
my_index = []
for i in list(df_pred_origin.index):
    if df_pred_origin.iloc[i, 0] not in dual:
        my_index.append(i)
df_pred_origin_sub1 = df_pred_origin.iloc[my_index, [0, 3, 7]]
df_pred_origin_sub1.head()
   文物编号  化学成分标签       风化前预测
0  2       二氧化硅(SiO2)  41.3435
1  7       二氧化硅(SiO2)  60.9480
4  9       二氧化硅(SiO2)  64.2920
5  10      二氧化硅(SiO2)  64.2120
6  11      二氧化硅(SiO2)  41.2345
pre_mean.head()
   文物编号  化学成分标签       风化前预测
0  50.0    二氧化硅(SiO2)  51.22975
1  50.0    氧化钠(Na2O)   0.694
2  50.0    氧化钾(K2O)    0.033
3  50.0    氧化钙(CaO)    2.6215
4  50.0    氧化镁(MgO)    0.7615
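
The nested loop above can be collapsed into a single groupby, which computes the same per-(artifact, component) means for the dual-point samples; a sketch:

# average the predictions of the two sampling points per artifact and component
pre_mean_alt = (df_pred_origin[df_pred_origin['文物编号'].isin(dual)]
                .groupby(['文物编号', '化学成分标签'], as_index=False)['风化前预测']
                .mean())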
df_merge = pd.concat([df_pred_origin_sub1, pre_mean])
df_merge = df_merge.sort_values(['化学成分标签', '文物编号'], ascending=[True, True])
df_merge
     文物编号  化学成分标签       风化前预测
0    2.0     二氧化硅(SiO2)  41.3435
1    7.0     二氧化硅(SiO2)  60.948
28   8.0     二氧化硅(SiO2)  33.3315
4    9.0     二氧化硅(SiO2)  64.292
5    10.0    二氧化硅(SiO2)  64.212
...  ...     ...           ...
204  53.0    氧化镁(MgO)    1.02
60   54.0    氧化镁(MgO)    0.8755
207  56.0    氧化镁(MgO)    0.0
208  57.0    氧化镁(MgO)    0.0
209  58.0    氧化镁(MgO)    0.743

476 rows × 3 columns

# Reshape the long results back to wide: one row per artifact, one column per
# component. The sort above (by label, then by artifact id) makes each block
# of nrow consecutive values one component's column.
nrow = len(df_merge['文物编号'].unique())
ncol = len(df_merge['化学成分标签'].unique())

df_shape = np.array(df_merge['风化前预测'], dtype=float).reshape(ncol, nrow)
df_results = pd.DataFrame(np.transpose(df_shape),
                          columns=list(df_merge.iloc[:, 1].unique()),
                          index=list(df_merge.iloc[:, 0].unique()))
# restore the component order used in the original data sheet (d2)
columns_order = list(d2.columns)[1:15]
df_results = df_results.reindex(columns=columns_order)
# rescale each row so the 14 predicted components sum to 100%
df_results = df_results.div(df_results.sum(axis=1), axis=0) * 100
df_results.to_csv('/home/shiyu/Desktop/path_acdemic/ant/数模/历年题目/2022/output/df_results.csv', index=True)
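
The manual reshape-and-transpose depends on the exact sort order of df_merge; pandas' pivot keys on the labels directly and is more robust to reordering. A sketch of the same wide table built that way:

# one row per artifact, one column per component, keyed by label
df_results_alt = (df_merge.pivot(index='文物编号', columns='化学成分标签',
                                 values='风化前预测')
                  .astype(float)
                  .reindex(columns=columns_order))
# rescale each row to sum to 100%
df_results_alt = df_results_alt.div(df_results_alt.sum(axis=1), axis=0) * 100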
