Contents
Part 1: Data Exploration
1 Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import make_scorer,mean_squared_error
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
2 Load the data
## Read the data with pandas read_csv(); the delimiter is '\t'
train_data_file = "C:/Users/Administrator/Downloads/zhengqi_train.txt"
test_data_file = "C:/Users/Administrator/Downloads/zhengqi_test.txt"
data_train = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
data_test = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
3 Inspect the data
## 1) Basic information
data_train.info()
data_test.info()
## 2) Summary statistics
data_train.describe()
data_test.describe()
## 3) First rows of each field
data_train.head()
data_test.head()
4 Outlier visualization
Method 1: Box plots
## Draw a box plot for every variable in the training set
column = data_train.columns.tolist()[:39]  # column names (38 features plus target)
fig = plt.figure(figsize=(20, 40))  # figure width and height
for i in range(38):
    plt.subplot(13, 3, i + 1)  # 13 rows x 3 columns of subplots
    sns.boxplot(data_train[column[i]], orient="v", width=0.5)  # box plot
    plt.ylabel(column[i], fontsize=8)
plt.show()
Method 2: Model-based prediction
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff**2)
    n = len(y_pred)
    return np.sqrt(sum_sq/n)
def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)
# Detect outliers with a prediction model
def find_outliers(model, X, y, sigma=3):
    # predict y values using model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if predicting fails, try fitting the model first
    except:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)
    # calculate residuals between the model prediction and true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    # calculate z statistic, define outliers to be where |z|>sigma
    z = (resid - mean_resid)/std_resid
    outliers = z[abs(z)>sigma].index
    # print and plot the results
    print('R2=',model.score(X,y))
    print('rmse=',rmse(y, y_pred))
    print("mse=",mean_squared_error(y,y_pred))
    print('---------------------------------------')
    print('mean of residuals:',mean_resid)
    print('std of residuals:',std_resid)
    print('---------------------------------------')
    print(len(outliers),'outliers:')
    print(outliers.tolist())
    plt.figure(figsize=(15,5))
    ax_131 = plt.subplot(1,3,1)
    plt.plot(y,y_pred,'.')
    plt.plot(y.loc[outliers],y_pred.loc[outliers],'ro')
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred');
    ax_132 = plt.subplot(1,3,2)
    plt.plot(y,y-y_pred,'.')
    plt.plot(y.loc[outliers],y.loc[outliers]-y_pred.loc[outliers],'ro')
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred');
    ax_133 = plt.subplot(1,3,3)
    z.plot.hist(bins=50,ax=ax_133)
    z.loc[outliers].plot.hist(color='r',bins=50,ax=ax_133)
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('z')
    plt.savefig('outliers.png')
    return outliers
## Find outliers with a Ridge regression model and plot their distribution
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
X_train = data_train.iloc[:, 0:-1]
y_train = data_train.iloc[:, -1]
outliers = find_outliers(Ridge(), X_train, y_train)
5 Data distribution visualization
## Plot a histogram and Q-Q plot for every column to check whether the training data is approximately normal
train_cols = 6
train_rows = len(data_train.columns)
plt.figure(figsize=(4*train_cols, 4*train_rows))
i = 0
for col in data_train.columns:
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    sns.distplot(data_train[col], fit=stats.norm)
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    res = stats.probplot(data_train[col], plot=plt)
plt.show()
6 Train/test feature distribution comparison
## For every feature, compare the distributions of the training and test data and look for features whose distributions differ
dist_cols = 6
dist_rows = len(data_test.columns)
plt.figure(figsize=(4*dist_cols, 4*dist_rows))
i = 1
for col in data_test.columns:
    ax = plt.subplot(dist_rows, dist_cols, i)
    ax = sns.kdeplot(data_train[col], color="Red", shade=True)
    ax = sns.kdeplot(data_test[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
## Look at the distributions of features 'V5', 'V9', 'V11', 'V17', 'V22', 'V27'
drop_col = 6
drop_row = 1
plt.figure(figsize=(5*drop_col, 5*drop_row))
i = 1
for col in ["V5", "V9", "V11", "V17", "V22", "V27"]:
    ax = plt.subplot(drop_row, drop_col, i)
    ax = sns.kdeplot(data_train[col], color="Red", shade=True)
    ax = sns.kdeplot(data_test[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
## Record the inconsistently distributed features that can be dropped
drop_columns_1 = ['V5','V9','V11','V17','V22','V27']
7 Feature correlation
data_train_drop = data_train.drop(drop_columns_1, axis=1)
train_corr = data_train_drop.corr()
train_corr
# Correlation heatmap
fig, ax = plt.subplots(figsize=(20, 16))  # set the canvas size
ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True)  # annot=True prints the coefficients
# Rank the correlations
data_train_drop = data_train.drop(drop_columns_1, axis=1)
plt.figure(figsize=(20, 16))  # figure width and height
colnm = data_train_drop.columns.tolist()  # column names
mcorr = data_train_drop[colnm].corr(method="spearman")  # correlation matrix: correlation between every pair of variables
mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix with the same shape as mcorr
mask[np.triu_indices_from(mask)] = True  # mask the upper triangle
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # matplotlib colormap object
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # heatmap of pairwise correlations
plt.show()
Part 2: Feature Engineering
1 Merge the data
## Concatenate the training and test sets
data_train["oringin"]="train"
data_test["oringin"]="test"
data_all=pd.concat([data_train,data_test],axis=0,ignore_index=True)
2 Drop the inconsistently distributed features
data_all.drop(drop_columns_1,axis=1,inplace=True)
3 Data normalization
## Min-max scale the numeric columns
cols_numeric = list(data_all.columns)
cols_numeric.remove("oringin")
def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())
data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
4 Box-Cox transformation of the features
## Apply a Box-Cox transform to each feature to bring it closer to normal
cols_transform = data_all.columns[0:-2]  # exclude 'target' and 'oringin'
for col in cols_transform:
    # transform column
    data_all.loc[:, col], _ = stats.boxcox(data_all.loc[:, col] + 1)
## Check the effect of the transformation
data_all_cols = 6
data_all_rows = len(data_all.columns[0:-2])
plt.figure(figsize=(4*data_all_cols, 4*data_all_rows))
i = 0
for col in data_all.columns[0:-2]:
    i += 1
    ax = plt.subplot(data_all_rows, data_all_cols, i)
    sns.distplot(data_all[col], fit=stats.norm)
    i += 1
    ax = plt.subplot(data_all_rows, data_all_cols, i)
    res = stats.probplot(data_all[col], plot=plt)
plt.show()
5 Transform the target variable
## Apply a power transform (1.5**y) to the target to bring it closer to a normal distribution, and plot the result
sp = data_train.target
data_train['targetlog'] = np.power(1.5, sp)
print(data_train['targetlog'].describe())
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
sns.distplot(data_train['targetlog'].dropna(), fit=stats.norm);
plt.subplot(1,2,2)
_ = stats.probplot(data_train['targetlog'].dropna(), plot=plt)
6 Extract the new training data
## Extract the training samples
train_scaler = data_all[data_all["oringin"]=="train"].copy()
train_scaler["label"] = data_train['targetlog']
train_scaler = train_scaler.drop(["oringin"], axis=1)
## Separate the training features from the label
train_scaler_X = train_scaler.drop(["label","target"], axis=1)
train_scaler_y = train_scaler.label
7 Feature selection
Method 1: VarianceThreshold (not applicable after min-max scaling)
## Variance threshold selection; returns the data after feature selection
## The threshold parameter is the variance cutoff
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(threshold=3).fit_transform(data_all.drop(["oringin"], axis=1))
Method 2: Correlation coefficients
## Initial screening by correlation with the target
mcorr = mcorr.abs()
numerical_corr = mcorr[mcorr['target'] > 0.1]['target']
features_corr = numerical_corr.sort_values(ascending=False).reset_index()
features_corr.columns = ['features_and_target', 'corr']
features_corr_select = features_corr[features_corr['corr'] > 0.1]  # keep features whose correlation with the target exceeds 0.1
print(features_corr_select)
select_features_corr = [col for col in features_corr_select['features_and_target'] if col not in ['target']]
train_scaler_X_corr_select = train_scaler_X[select_features_corr]
Method 3: f_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# Initialize a SelectKBest transformer that scores features with f_regression
sel = SelectKBest(f_regression, k='all')
sel = sel.fit(train_scaler_X, train_scaler_y)
# Call transform() (or fit_transform) to preprocess and transform the same data set
train_sel = sel.transform(train_scaler_X)
# Print the number of features before and after selection
print('Original number of features:', train_scaler_X.shape[1])
print('Number of selected features:', train_sel.shape[1])
plt.bar([i for i in range(len(sel.scores_))], sel.scores_)
plt.xlabel("feature index")
plt.ylabel("F-value (transformed from the correlation values)")
plt.show()
fr_sq = pd.DataFrame(sel.scores_, columns=["fr_Square"], index=train_scaler_X.columns)
fr_sq = fr_sq.reset_index()
fr_sq.sort_values('fr_Square', ascending=False)
Method 4: Mutual information
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
# Initialize a SelectKBest transformer that scores features with mutual information
sel = SelectKBest(mutual_info_regression, k='all')
sel = sel.fit(train_scaler_X, train_scaler_y)
# Scores
np.set_printoptions(precision=3)
print(sel.scores_)
# Call transform() (or fit_transform) to preprocess and transform the same data set
train_sel = sel.transform(train_scaler_X)
# Print the number of features before and after selection
print('Original number of features:', train_scaler_X.shape[1])
print('Number of selected features:', train_sel.shape[1])
mit_sq = pd.DataFrame(sel.scores_, columns=["MIT_Square"], index=train_scaler_X.columns)
mit_sq = mit_sq.reset_index()
mit_sq.sort_values('MIT_Square', ascending=False)
plt.bar([i for i in range(len(sel.scores_))], sel.scores_)
plt.xlabel("feature index")
plt.ylabel("Estimated MI value")
plt.show()
Method 5: Removing multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif(features):
    vif = pd.DataFrame()
    vif["index"] = features.columns
    vif["VIF"] = [variance_inflation_factor(features.values, i) for i in range(features.shape[1])]
    return vif
vif = calculate_vif(train_scaler_X_corr_select)
while vif['VIF'][vif['VIF'] > 10].any():
    # drop the feature with the largest VIF and recompute
    remove = vif.sort_values('VIF', ascending=False)['index'][:1]
    train_scaler_X_corr_select.drop(remove, axis=1, inplace=True)
    vif = calculate_vif(train_scaler_X_corr_select)
select_features_vif = list(vif["index"])
train_scaler_X_vif_select = train_scaler_X_corr_select[select_features_vif]
Method 6: Copula entropy
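The original notes give no code for this method. As a rough sketch: the copula entropy between each feature and the target equals the negative mutual information of their rank-transformed (uniform-margin) copies, so it can be approximated with scikit-learn's k-nearest-neighbour mutual-information estimator. The helper rank_uniform and the -0.01 cutoff below are illustrative assumptions, and a dedicated estimator (e.g. the copent package) could be used instead.
## Approximate copula-entropy screening (sketch; assumes train_scaler_X / train_scaler_y from Section 6)
from sklearn.feature_selection import mutual_info_regression
def rank_uniform(s):
    # empirical-CDF (rank) transform to uniform margins
    return s.rank(method='average') / (len(s) + 1)
u_X = train_scaler_X.apply(rank_uniform, axis=0)
u_y = rank_uniform(train_scaler_y)
# copula entropy Hc(X_i, y) = -MI of the rank-transformed pair; more negative means stronger dependence
ce = pd.Series(-mutual_info_regression(u_X, u_y), index=u_X.columns).sort_values()
print(ce)
select_features_ce = ce[ce < -0.01].index.tolist()  # keep features with clearly negative copula entropy (illustrative cutoff)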
Method 7: RFE
## Recursive feature elimination; returns the data after feature selection
## The estimator parameter is the base model
## The n_features_to_select parameter is the number of features to keep
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
RFE(estimator=LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=500), n_features_to_select=2).fit_transform(df_train_X, df_train_y)
Method 8: Penalty-based selection
## Feature selection with a penalized logistic regression as the base model (the code below uses the L2 penalty; use penalty='l1' with solver='liblinear' for L1)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
SelectFromModel(LogisticRegression(penalty='l2',C=0.1,solver='lbfgs',multi_class='auto')).fit_transform(df_train_X,df_train_y)
Method 9: Tree-based models
## Feature selection with GBDT as the base model
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
df_train_X.shape
clf = GradientBoostingRegressor()
clf = clf.fit(df_train_X, df_train_y)
clf.feature_importances_
model = SelectFromModel(clf, prefit=True)
df_train_X_new = model.transform(df_train_X)
df_train_X_new.shape
## Feature selection with LightGBM
import lightgbm
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train_scaler_X, train_scaler_y, test_size=0.2, random_state=0)
clf = lightgbm
train_matrix = clf.Dataset(X_train, label=y_train)
valid_matrix = clf.Dataset(X_valid, label=y_valid)
params = {
'boosting_type': 'gbdt',
#'boosting_type': 'dart',
'objective': 'regression',
'min_child_weight': 1.5,
'lambda_l2': 10,
'subsample': 0.7,
'colsample_bytree': 0.7,
'colsample_bylevel': 0.7,
'learning_rate': 0.01,
'seed': 2017,
}
num_round = 10000
early_stopping_rounds = 100
model = clf.train(params,
                  train_matrix,
                  num_round,
                  valid_sets=valid_matrix,
                  early_stopping_rounds=early_stopping_rounds)
lgb_feature = pd.DataFrame(model.feature_importance(), columns = ["feature_importance"], index=model.feature_name())
lgb_feature = lgb_feature.reset_index()
lgb_feature.sort_values('feature_importance',ascending=0)
Method 10: The Boruta algorithm
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_jobs = -1, max_depth = 5)
boruta = BorutaPy(
estimator = rf,
n_estimators = 'auto',
max_iter = 100 # number of trials to perform
)
# Fit the model
boruta.fit(np.array(train_scaler_X), np.array(train_scaler_y))
# Output the results
green_area = train_scaler_X.columns[boruta.support_].to_list()
blue_area = train_scaler_X.columns[boruta.support_weak_].to_list()
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)
data_all_process=data_all[['V0', 'V1', 'V2', 'V3', 'V6', 'V7', 'V8', 'V10', 'V12', 'V13', 'V16', 'V19', 'V28', 'V31', 'V36', 'V37',"oringin","target"]]
8 Dimensionality reduction
Method 1: Principal component analysis
## PCA; returns the data after dimensionality reduction
## The n_components parameter is the number of principal components
from sklearn.decomposition import PCA
PCA(n_components=2).fit_transform(data_all.drop(["oringin"], axis=1))
Method 2: Linear discriminant analysis
## LDA; returns the data after dimensionality reduction
## The n_components parameter is the number of dimensions to keep
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
LDA(n_components=2).fit_transform(df_train_X, df_train_y)
Part 3: Model Training
1 Get the training, validation, and test data
## Extract the training samples
def get_training_data():
    # extract training samples
    from sklearn.model_selection import train_test_split
    df_train = data_all[data_all["oringin"]=="train"].copy()
    df_train["label"] = train_scaler_y
    # split the label and the features
    y = df_train.label
    X = df_train.drop(["oringin","target","label"], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=100)
    return X_train, X_valid, y_train, y_valid
## Extract the test data (no target)
def get_test_data():
    df_test = data_all[data_all["oringin"]=="test"].reset_index(drop=True)
    return df_test.drop(["oringin","target"], axis=1)
2 Define the model scoring functions
from sklearn.metrics import make_scorer
# metric for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff**2)
    n = len(y_pred)
    return np.sqrt(sum_sq/n)
def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)
# scorer to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)
3 Outlier handling
Method 1: Box plots
plt.figure(figsize=(18, 10))
plt.boxplot(x=data_train.values, labels=data_train.columns)
plt.hlines([-7.5, 7.5], 0, 40, colors='r')
plt.show()
## Remove the training samples that contain outliers
data_train = data_train[data_train['V9'] > -7.5]
data_train.describe()
## Remove the test samples that contain outliers
data_test = data_test[data_test['V9'] > -7.5]
data_test.describe()
Method 2: Model-based prediction
## Detect outliers with a prediction model, reusing the find_outliers() function defined in Part 1, Section 4
## Get the training data
from sklearn.linear_model import Ridge
X_train, X_valid,y_train,y_valid = get_training_data()
test=get_test_data()
# find and remove outliers using a Ridge model
outliers = find_outliers(Ridge(), X_train, y_train)
# permanently remove these outliers from the data
#df_train = data_all[data_all["oringin"]=="train"]
#df_train["label"]=data_train.target1
#df_train=df_train.drop(outliers)
X_outliers=X_train.loc[outliers]
y_outliers=y_train.loc[outliers]
X_t=X_train.drop(outliers)
y_t=y_train.drop(outliers)
4 Train models on the data with outliers removed
def get_training_data_omitoutliers():
    y1 = y_t.copy()
    X1 = X_t.copy()
    return X1, y1
5 Train models with grid search
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, cross_val_predict, KFold
def train_model(model, param_grid=[], X=[], y=[],
                splits=5, repeats=5):
    # get unmodified training data, unless data to use already specified
    if len(y)==0:
        X,y = get_training_data_omitoutliers()
        #poly_trans=PolynomialFeatures(degree=2)
        #X=poly_trans.fit_transform(X)
        #X=MinMaxScaler().fit_transform(X)
    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)
    # perform a grid search if param_grid given
    if len(param_grid)>0:
        # setup grid search parameters
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)
        # search the grid
        gsearch.fit(X,y)
        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_
        # get cv-scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx,'mean_test_score'])
        cv_std = grid_results.loc[best_idx,'std_test_score']
    # no grid search, just cross-val score for given model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
        # fit the model on the full training data so it can be scored below
        model.fit(X, y)
    # combine mean and std cv-score in to a pandas series
    cv_score = pd.Series({'mean':cv_mean,'std':cv_std})
    # predict y using the fitted model
    y_pred = model.predict(X)
    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=',model.score(X,y))
    print('rmse=',rmse(y, y_pred))
    print('mse=',mse(y, y_pred))
    print('cross_val: mean=',cv_mean,', std=',cv_std)
    # residual plots
    y_pred = pd.Series(y_pred,index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid)/std_resid
    n_outliers = sum(abs(z)>3)
    plt.figure(figsize=(15,5))
    ax_131 = plt.subplot(1,3,1)
    plt.plot(y,y_pred,'.')
    plt.xlabel('y')
    plt.ylabel('y_pred');
    plt.title('corr = {:.3f}'.format(np.corrcoef(y,y_pred)[0][1]))
    ax_132 = plt.subplot(1,3,2)
    plt.plot(y,y-y_pred,'.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred');
    plt.title('std resid = {:.3f}'.format(std_resid))
    ax_133 = plt.subplot(1,3,3)
    z.plot.hist(bins=50,ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))
    return model, cv_score, grid_results
## Store the best models and their scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean','std'])
# no. k-fold splits
splits=5
# no. k-fold iterations
repeats=5
6 Regression models
Model 1: XGBoost regression
from xgboost import XGBRegressor
model = 'XGB'
opt_models[model] = XGBRegressor(objective='reg:squarederror')
param_grid = { 'max_depth': [1,2,3],
'learning_rate': [0.2,0.3,0.4],
'n_estimators': [50,100,150],
'gamma':[0,0.05,0.1],
'min_child_weight': [0,1,2],
}
opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])  # DataFrame.append was removed in pandas 2.0
from xgboost import plot_importance
plot_importance(opt_models[model])
plt.show()
score_test = mean_squared_error(y_valid, opt_models[model].predict(X_valid))
print("XGBRegressor RandomizedSearchCV test MSE: ", score_test)
Model 2: LightGBM regression
import lightgbm as lgb
model = 'LGB'
opt_models[model] = lgb.LGBMRegressor(objective='regression',boosting_type='gbdt')
param_grid = { 'max_depth': [-2,-1,0,1,2],
'learning_rate': [0.01,0.1,1],
'n_estimators': [200,600,1000,2000,5000],
}
opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
Model 3: Random forest regression
from sklearn.ensemble import RandomForestRegressor
model = 'RandomForest'
opt_models[model] = RandomForestRegressor()
param_grid = {'n_estimators':[100,150,200],
'max_features':[8,12,16,20,24],
'min_samples_split':[2,4,6]}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=5, repeats=1)
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
7 Learning curve
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
X = train_scaler_X.values
y = train_scaler_y.values
title = "LinearRegression"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = model_selection.ShuffleSplit(n_splits=100,
                                  test_size=0.2, random_state=0)
estimator = SGDRegressor()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=-1)
8 Validation curve
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import validation_curve
X = train_scaler_X.values
y = train_scaler_y.values
# max_iter=1000, tol=1e-3, penalty='l1', alpha=0.00001
param_range = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
train_scores, test_scores = validation_curve(
    SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1'), X, y, param_name="alpha", param_range=param_range,
    cv=10, scoring='r2', n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with SGDRegressor")
plt.xlabel("alpha")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2, color="r")
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
color="g")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
Part 4: Model Fusion
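This part is left empty in the original notes. As a minimal sketch, the tuned models stored in opt_models (Part 3) can be blended by averaging their predictions on the test features from get_test_data(); the equal weights and the inversion of the 1.5**y target transform below are illustrative assumptions, not something the original notebook prescribes.
## Simple average blending of the tuned models (illustrative sketch)
test_X = get_test_data()
blend_preds = pd.DataFrame({name: m.predict(test_X) for name, m in opt_models.items()})
blend_pred = blend_preds.mean(axis=1)  # equal weights; the cross-validation scores in score_models could be used to weight instead
# the models were trained on label = 1.5**target (Part 2, Section 5), so map back to the original scale
final_pred = np.log(blend_pred) / np.log(1.5)
print(final_pred.describe())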
Part 5: Useful Snippets
1 Problem: calling fit(x_train, y_train) raises an error
ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
## Check for infinite values
# False: the data contains inf/NaN
# True: the data contains no inf/NaN
print(np.isfinite(train).all())
# False: no infinite values
# True: infinite values present
print(np.isinf(train).any())
# Replace infinite values
train_inf = np.isinf(train)
train[train_inf] = 0
## Check for missing values; either of the following works
# False: the corresponding feature has no missing values
# True: the feature has missing values
print(train.isnull().any())
print(np.isnan(train).any())
# Inspect the records with missing values
train_null = pd.isnull(train)
train_null = train[train_null == True]
print(train_null)
## Handle missing values; either of the following works
# Drop the rows that contain missing values
train.dropna(inplace=True)
# Or fill the missing values (assignment is needed, and the fill value should be numeric)
train = train.fillna(100)
2 Fix for truncated (ellipsis) output when printing DataFrames
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)