import pandas as pd
import numpy as np

# Load the preprocessed ROE dataset (path kept exactly as in the original).
data = pd.read_excel(r"D:\学习\技术转移研究前沿\论文\预处理6_ROE.xlsx")
data.info()

from sklearn.impute import SimpleImputer

# Mean-impute the continuous columns. The original repeated the same
# three-line snippet once per column; a single loop is equivalent.
# SimpleImputer's default strategy is "mean".
for col in ("qage", "tobinQ", "leverage(资产负债率)", "total"):
    vals = data.loc[:, col].values.reshape(-1, 1)
    data.loc[:, col] = SimpleImputer().fit_transform(vals)

# Remaining missing-value counts (displayed in the notebook; a no-op here).
data.isnull().sum()
# Fill the indicator/count columns (positions 3..13) with 0 where missing.
# (Loop body indentation was lost in the original paste — restored here.)
for col in list(data.columns)[3:14]:
    vals = data.loc[:, col].values.reshape(-1, 1)
    data.loc[:, col] = SimpleImputer(strategy="constant", fill_value=0).fit_transform(vals)

# Encode gender in one pass: male ('男') -> 0, female ('女') -> 1.
# A single dict-based replace is equivalent to the two chained calls.
data = data.replace({'男': 0, '女': 1})
data.info()

# The replacement leaves these columns with object dtype; cast to float64.
# astype is the idiomatic form of the original np.float64(series) calls.
data['Age'] = data['Age'].astype(np.float64)
data['Degree'] = data['Degree'].astype(np.float64)
data['patents'] = data['patents'].astype(np.float64)
# Per-column mean and (population, ddof=0 — np.std default) standard
# deviation for the first 15 columns. Comprehensions replace the original
# append loop; results are identical.
summary_cols = list(data.columns)[0:15]
m = [np.mean(data[c]) for c in summary_cols]
s = [np.std(data[c]) for c in summary_cols]
print(m)
print(s)
from xgboost import XGBRegressor as XGBR
from xgboost import XGBClassifier as XGBC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt

# Features are columns 1..16; target (ROE) is column 0.
x = data.iloc[:, 1:17]
y = data.iloc[:, 0]
x_train, x_test, y_train, y_test = TTS(x, y, test_size=0.3, random_state=420)

# Baseline: 100 trees, default hyper-parameters.
clf = XGBR(n_estimators=100).fit(x_train, y_train)
# NOTE(review): these were bare expressions displayed by notebook cells;
# as a script they were silently discarded — print them instead.
print("test predictions:", clf.predict(x_test))
print("R2 on full data:", clf.score(x, y))  # optimistic: includes training rows
print("R2 on test set:", clf.score(x_test, y_test))
print("MSE on test set:", MSE(y_test, clf.predict(x_test)))
print("feature importances:", clf.feature_importances_)
x.info()
print("MSE on full data:", MSE(y, clf.predict(x)))
# 5-fold shuffled CV reused by every hyper-parameter sweep below.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Coarse sweep over n_estimators (1000..9000 step 1000), scored by mean CV R2.
# (Loop body indentation was lost in the original paste — restored here.)
axisx = range(1000, 10000, 1000)
rs = []
for n in axisx:
    reg = XGBR(n_estimators=n, random_state=420)
    rs.append(CVS(reg, x_train, y_train, cv=cv).mean())
best = rs.index(max(rs))  # hoist the repeated index() lookup
print(axisx[best], max(rs))
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="red", label="XGB")
plt.legend()
plt.show()
# Fine sweep over small n_estimators values, tracking the CV mean score,
# its variance, and a bias²+variance proxy for generalization error:
# ge = (1 - mean)² + var.
axisx = range(10, 50, 2)
rs, var, ge = [], [], []
for n in axisx:
    reg = XGBR(n_estimators=n, random_state=420)
    cvresult = CVS(reg, x_train, y_train, cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())
# Hoist the repeated index() lookups the original recomputed per print.
i_best_r = rs.index(max(rs))
i_min_var = var.index(min(var))
i_min_ge = ge.index(min(ge))
print(axisx[i_best_r], max(rs), var[i_best_r])    # highest mean score
print(axisx[i_min_var], rs[i_min_var], min(var))  # lowest variance
print(axisx[i_min_ge], rs[i_min_ge], min(ge))     # lowest generalization error
# Sweep the row-subsampling ratio.
# BUG FIX: np.linspace(0, 1, 20) makes the first candidate subsample=0, but
# XGBoost requires subsample in (0, 1] — a value of 0 is rejected at fit
# time. Start the grid just above 0 instead.
axisx = np.linspace(0.05, 1, 20)
rs = []
for sub in axisx:
    reg = XGBR(n_estimators=1000, subsample=sub, random_state=420)
    rs.append(CVS(reg, x_train, y_train, cv=cv).mean())
print(axisx[rs.index(max(rs))], max(rs))
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="green", label="XGB")
plt.legend()
plt.show()
# NOTE(review): this cell is a byte-for-byte duplicate of the subsample
# sweep above (likely an accidental re-run kept in the export). Preserved,
# with the same subsample=0 fix: XGBoost requires subsample in (0, 1].
axisx = np.linspace(0.05, 1, 20)
rs = []
for sub in axisx:
    reg = XGBR(n_estimators=1000, subsample=sub, random_state=420)
    rs.append(CVS(reg, x_train, y_train, cv=cv).mean())
print(axisx[rs.index(max(rs))], max(rs))
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="green", label="XGB")
plt.legend()
plt.show()
# Final model with the tuned hyper-parameters.
# (The original first fitted a throwaway 100-tree model and immediately
# overwrote it on the next line — that dead fit is removed.)
reg = XGBR(n_estimators=1000, subsample=0.842, learning_rate=0.1, max_depth=16).fit(x_train, y_train)
# These were bare notebook-display expressions; print them in script form.
print("R2 on full data:", reg.score(x, y))  # optimistic: includes training rows
print("R2 on test set:", reg.score(x_test, y_test))
print("MSE on full data:", MSE(y, reg.predict(x)))

# Horizontal bar chart of feature importances, least important at the bottom.
importances = reg.feature_importances_
features = x.columns
indices = np.argsort(importances)  # the original [0:16] slice covered all features — a no-op
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

# Partial-dependence plot of the tuned model's response to 'Degree',
# evaluated on the held-out test rows.
feature_names = list(x.columns)
pdp_goals = pdp.pdp_isolate(model=reg, dataset=x_test, model_features=feature_names, feature='Degree')
pdp.pdp_plot(pdp_goals, 'Degree')
plt.show()