2022 CUMCM (Higher Education Press Cup Contemporary Undergraduate Mathematical Contest in Modeling), Problem C, Question 1(3): Python Code

1.3 Predicting the pre-weathering chemical composition from measurements taken at weathered sampling points


Data Reshaping

import numpy as np
import pandas as pd

# Reshape the wide table d12 (one row per sampling point, with the 14
# composition columns in positions 6-19) into long format: one row per
# (sampling point, chemical component).
df = pd.DataFrame(columns=['文物编号', '风化标记', '化学成分含量', '化学成分标签',
                           '纹饰', '类型', '颜色', '表面风化'],
                  index=range(d12.shape[0] * 14))
df['文物编号'] = list(d12['文物编号']) * 14
df['风化标记'] = list(d12['风化标记']) * 14
df['纹饰'] = list(d12['纹饰']) * 14
df['类型'] = list(d12['类型']) * 14
df['颜色'] = list(d12['颜色']) * 14
df['表面风化'] = list(d12['表面风化']) * 14
# each component label is repeated once per sampling point
df['化学成分标签'] = list(np.repeat(list(d12.columns[6:20]), d12.shape[0]))

# stack the 14 composition columns into a single content column
a = list(d12.iloc[:, 6])
for i in range(7, 20):
    a.extend(d12.iloc[:, i])
df['化学成分含量'] = a
df.head()
   文物编号  风化标记  化学成分含量  化学成分标签       纹饰  类型  颜色  表面风化
0  1        其它     69.33     二氧化硅(SiO2)  C    高钾  蓝绿  无风化
1  2        其它     36.28     二氧化硅(SiO2)  A    铅钡  浅蓝  风化
2  3        其它     87.05     二氧化硅(SiO2)  A    高钾  蓝绿  无风化
3  3        其它     61.71     二氧化硅(SiO2)  A    高钾  蓝绿  无风化
4  4        其它     65.88     二氧化硅(SiO2)  A    高钾  蓝绿  无风化
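
The same wide-to-long reshape can be done in a single call with pandas' built-in melt. A minimal sketch, assuming d12 holds the six metadata columns followed by the 14 composition columns as above:

# melt stacks the 14 composition columns column-by-column, matching the
# ordering produced by the manual construction above
id_cols = ['文物编号', '风化标记', '纹饰', '类型', '颜色', '表面风化']
df_long = d12.melt(id_vars=id_cols,
                   value_vars=list(d12.columns[6:20]),
                   var_name='化学成分标签',
                   value_name='化学成分含量')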

Data Visualization

import plotly.express as px
fig = px.box(df, x="化学成分标签", y="化学成分含量", color="风化标记")
# remove background color
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)',})
fig.show()

[Figure: box plots of each chemical component's content, grouped by weathering mark]

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings('ignore')
# Fill missing 颜色 (color) values: in both the weathered and unweathered
# glass, '浅蓝' (light blue) is the most frequent color, so impute with the mode.
df['颜色'] = df['颜色'].fillna('浅蓝')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 938 entries, 0 to 937
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   文物编号    938 non-null    int64  
 1   风化标记    938 non-null    object 
 2   化学成分含量  938 non-null    float64
 3   化学成分标签  938 non-null    object 
 4   纹饰      938 non-null    object 
 5   类型      938 non-null    object 
 6   颜色      938 non-null    object 
 7   表面风化    938 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 58.8+ KB
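
The hardcoded '浅蓝' can also be computed rather than assumed; a small equivalent sketch that imputes with whatever color is most frequent:

# mode()[0] returns the most frequent color, so this stays correct even if
# the underlying data changes
df['颜色'] = df['颜色'].fillna(df['颜色'].mode()[0])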
# Move 化学成分含量 next to 文物编号 so the target sits in a fixed position.
df = df.iloc[:, [0, 2, 1] + list(range(3, 8))]

# Encode the categorical (object-dtype) columns as integer labels.
label_encoder = LabelEncoder()
x_categorical = df.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = df.select_dtypes(exclude=['object']).values

df_encode = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1)
df_encode.rename(columns={0: '文物编号', 1: '化学成分含量'},
                 inplace=True)
df_encode.head()
   文物编号  化学成分含量  风化标记  化学成分标签  纹饰  类型  颜色  表面风化
0  1.0     69.33     1       0         2    1    6    0
1  2.0     36.28     1       0         0    0    1    1
2  3.0     87.05     1       0         0    1    6    0
3  3.0     61.71     1       0         0    1    6    0
4  4.0     65.88     1       0         0    1    6    0
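
One caveat with applying a single shared LabelEncoder: it is refit on every column, so only the last fit survives and the integer codes cannot be inverted afterwards. A sketch of keeping one fitted encoder per column, should the original category names be needed later:

# fit and store one encoder per categorical column
encoders = {}
x_categorical = df.select_dtypes(include=['object']).copy()
for col in x_categorical.columns:
    enc = LabelEncoder()
    x_categorical[col] = enc.fit_transform(x_categorical[col])
    encoders[col] = enc

# e.g. map the integer color codes back to their names:
# encoders['颜色'].inverse_transform(df_encode['颜色'].unique())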
# split features/target and hold out 20% of rows for testing
X = df_encode.drop('化学成分含量', axis=1)
y = df_encode['化学成分含量']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=1)

Regression

Random Forest Regression

Reference: https://www.geeksforgeeks.org/random-forest-regression-in-python/

# Fitting Random Forest Regression to the dataset
RF = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)
 
# Fit the regressor with x and y data
RF.fit(X_train, y_train)

# Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score
 
# Access the OOB Score
oob_score = RF.oob_score_
print(f'Out-of-Bag Score: {oob_score}')
 
# Making predictions on the same data or new data
predictions = RF.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Out-of-Bag Score: 0.858876239921834
Mean Squared Error: 18.621843142992024
R-squared: 0.9440368907222724
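A single 80/20 hold-out on 938 rows can be noisy, so the cross_val_score imported earlier (but not yet used) can give a less split-dependent estimate. A sketch:

# 5-fold cross-validated R² for the same random forest configuration
cv_r2 = cross_val_score(RandomForestRegressor(n_estimators=10, random_state=0),
                        X, y, cv=5, scoring='r2')
print(f'CV R-squared: {cv_r2.mean():.4f} ± {cv_r2.std():.4f}')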
XGBoost Regression
import xgboost as xg

# 'reg:linear' was deprecated in favor of the equivalent 'reg:squarederror'
xgb_r = xg.XGBRegressor(objective='reg:squarederror',
                        n_estimators=10, seed=123)
xgb_r.fit(X_train, y_train)
predictions = xgb_r.predict(X_test) 

# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 31.125352266521848
R-squared: 0.9064608440301115
Gradient Boosting Regression

from sklearn.ensemble import GradientBoostingRegressor

GB = GradientBoostingRegressor()
GB.fit(X_train, y_train)

# Making predictions on the same data or new data
predictions = GB.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 44.14340281131733
R-squared: 0.8673384768386802
LightGBM Regression
from lightgbm import LGBMRegressor

gbm = LGBMRegressor()
gbm.fit(X_train, y_train)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 7
[LightGBM] [Info] Start training from score 6.549480
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf   (repeated for many iterations; elided)

LGBMRegressor()
# Making predictions on the same data or new data
predictions = gbm.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 38.507655924247906
R-squared: 0.8842752492344551
CatBoost Regression

from catboost import CatBoostRegressor

cat = CatBoostRegressor(verbose=0, n_estimators=100)
cat.fit(X_train, y_train)

# Making predictions on the same data or new data
predictions = cat.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 19.8730131181204
R-squared: 0.94027682457278
Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(random_state=0)
tree.fit(X_train, y_train)

# Making predictions on the same data or new data
predictions = tree.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 18.104645212765963
R-squared: 0.9455911946687293
MLP Regression
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(random_state=0)
mlp.fit(X_train, y_train)

# Making predictions on the same data or new data
predictions = mlp.predict(X_test)
 
# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 213.8644123666194
R-squared: 0.3572860974079113
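
The MLP trails the tree-based models by a wide margin, most likely because the inputs are unscaled label codes, to which neural networks are far more sensitive than trees. A sketch of the usual remedy, standardizing features inside a pipeline (whether it closes the gap here is untested):

from sklearn.pipeline import make_pipeline

# scale features before the MLP; max_iter raised since MLPs on small data
# often need more than the default 200 iterations to converge
mlp_scaled = make_pipeline(StandardScaler(),
                           MLPRegressor(random_state=0, max_iter=1000))
mlp_scaled.fit(X_train, y_train)
print(f'R-squared: {r2_score(y_test, mlp_scaled.predict(X_test))}')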

Prediction

# Build the prediction set: all rows measured at weathered sampling points.
# .copy() avoids SettingWithCopyWarning on the assignments below.
df_pred = df_encode[df_encode['表面风化'] == 1].copy()
df_pred_origin = df[df_encode['表面风化'] == 1].copy()

# Flip the weathering flag to 0 to represent the pre-weathering state.
df_pred['表面风化'] = 0
X_pred = df_pred.drop('化学成分含量', axis=1)
X_pred
     文物编号  风化标记  化学成分标签  纹饰  类型  颜色  表面风化
1    2.0     1       0         0    0    1    0
8    7.0     1       0         1    1    6    0
9    8.0     1       0         2    0    4    0
10   8.0     0       0         2    0    4    0
11   9.0     1       0         1    1    6    0
...  ...     ...     ...       ...  ...  ...  ...
932  54.0    1       1         2    0    1    0
933  54.0    0       1         2    0    1    0
935  56.0    1       1         2    0    6    0
936  57.0    1       1         2    0    6    0
937  58.0    1       1         2    0    1    0

588 rows × 7 columns

# Use the random forest model for the final predictions.
predictions = RF.predict(X_pred)
df_pred_origin = df_pred_origin.drop('表面风化', axis=1)
df_pred_origin['风化前预测'] = predictions
df_pred_origin.head()
     文物编号  化学成分含量  风化标记      化学成分标签       纹饰  类型  颜色  风化前预测
1    2       36.28     其它        二氧化硅(SiO2)  A    铅钡  浅蓝  41.3435
8    7       92.63     其它        二氧化硅(SiO2)  B    高钾  蓝绿  60.9480
9    8       20.14     其它        二氧化硅(SiO2)  C    铅钡       37.7020
10   8       4.61      严重风化点  二氧化硅(SiO2)  C    铅钡       28.9610
11   9       95.02     其它        二氧化硅(SiO2)  B    高钾  蓝绿  64.2920

Data Restoration

# For artifacts with two weathered sampling points, take the mean of the two
# predicted values per chemical component.
dual = list(df_pred['文物编号'].value_counts().index[0:8])  # the 8 dual-point artifacts
labels = list(df_pred_origin['化学成分标签'].unique())
pre_mean = pd.DataFrame(columns=['文物编号', '化学成分标签', '风化前预测'],
                        index=range(len(dual) * len(labels)))
pre_mean['文物编号'] = list(np.repeat(dual, len(labels)))
pre_mean['化学成分标签'] = labels * len(dual)

for i in dual:
    i = int(i)
    for j in labels:
        if i in list(df_pred_origin['文物编号'].unique()):
            index = np.where((df_pred_origin['文物编号'] == i) & (df_pred_origin['化学成分标签'] == j))[0]
            pre_mean.iloc[dual.index(i)*len(labels) + labels.index(j), 2] = df_pred_origin.iloc[index, 7].mean()
# Rows for artifacts with a single sampling point are kept as-is.
df_pred_origin.index = range(df_pred_origin.shape[0])
my_index = []
for i in list(df_pred_origin.index):
    if df_pred_origin.iloc[i, 0] not in dual:
        my_index.append(i)
df_pred_origin_sub1 = df_pred_origin.iloc[my_index, [0, 3, 7]]
df_pred_origin_sub1.head()
   文物编号  化学成分标签       风化前预测
0  2       二氧化硅(SiO2)  41.3435
1  7       二氧化硅(SiO2)  60.9480
4  9       二氧化硅(SiO2)  64.2920
5  10      二氧化硅(SiO2)  64.2120
6  11      二氧化硅(SiO2)  41.2345
pre_mean.head()
   文物编号  化学成分标签       风化前预测
0  50.0    二氧化硅(SiO2)  51.22975
1  50.0    氧化钠(Na2O)   0.694
2  50.0    氧化钾(K2O)    0.033
3  50.0    氧化钙(CaO)    2.6215
4  50.0    氧化镁(MgO)    0.7615
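
The nested loop above can be collapsed into a single groupby, which computes the same per-(artifact, component) means for the dual-point samples; a sketch:

# average the predictions of the two sampling points per artifact and component
pre_mean_alt = (df_pred_origin[df_pred_origin['文物编号'].isin(dual)]
                .groupby(['文物编号', '化学成分标签'], as_index=False)['风化前预测']
                .mean())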
df_merge = pd.concat([df_pred_origin_sub1, pre_mean])
df_merge = df_merge.sort_values(['化学成分标签', '文物编号'], ascending=[True, True])
df_merge
     文物编号  化学成分标签       风化前预测
0    2.0     二氧化硅(SiO2)  41.3435
1    7.0     二氧化硅(SiO2)  60.948
28   8.0     二氧化硅(SiO2)  33.3315
4    9.0     二氧化硅(SiO2)  64.292
5    10.0    二氧化硅(SiO2)  64.212
...  ...     ...           ...
204  53.0    氧化镁(MgO)    1.02
60   54.0    氧化镁(MgO)    0.8755
207  56.0    氧化镁(MgO)    0.0
208  57.0    氧化镁(MgO)    0.0
209  58.0    氧化镁(MgO)    0.743

476 rows × 3 columns

# Reshape the long results back to wide: one row per artifact, one column per
# component. The sort above (by label, then by artifact id) makes each block
# of nrow consecutive values one component's column.
nrow = len(df_merge['文物编号'].unique())
ncol = len(df_merge['化学成分标签'].unique())

df_shape = np.array(df_merge['风化前预测'], dtype=float).reshape(ncol, nrow)
df_results = pd.DataFrame(np.transpose(df_shape),
                          columns=list(df_merge.iloc[:, 1].unique()),
                          index=list(df_merge.iloc[:, 0].unique()))
# restore the component order used in the original data sheet (d2)
columns_order = list(d2.columns)[1:15]
df_results = df_results.reindex(columns=columns_order)
# rescale each row so the 14 predicted components sum to 100%
df_results = df_results.div(df_results.sum(axis=1), axis=0) * 100
df_results.to_csv('/home/shiyu/Desktop/path_acdemic/ant/数模/历年题目/2022/output/df_results.csv', index=True)
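
The manual reshape-and-transpose depends on the exact sort order of df_merge; pandas' pivot keys on the labels directly and is more robust to reordering. A sketch of the same wide table built that way:

# one row per artifact, one column per component, keyed by label
df_results_alt = (df_merge.pivot(index='文物编号', columns='化学成分标签',
                                 values='风化前预测')
                  .astype(float)
                  .reindex(columns=columns_order))
# rescale each row to sum to 100%
df_results_alt = df_results_alt.div(df_results_alt.sum(axis=1), axis=0) * 100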
