Contents
Part 1: Data Exploration
1 Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import make_scorer,mean_squared_error
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
2 Load the data
## Read the data with pandas read_csv(); the delimiter is '\t'
train_data_file = "C:/Users/Administrator/Downloads/zhengqi_train.txt"
test_data_file = "C:/Users/Administrator/Downloads/zhengqi_test.txt"
data_train = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
data_test = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
3 Inspect the data
## 1) Basic information
data_train.info()
data_test.info()
## 2) Summary statistics
data_train.describe()
data_test.describe()
## 3) First rows of each field
data_train.head()
data_test.head()
4 Outlier visualization
Method 1: Box plots
## Draw a box plot for every variable in the training set
column = data_train.columns.tolist()[:39]  # column names (38 features plus target)
fig = plt.figure(figsize=(20, 40))  # figure width and height
for i in range(38):
    plt.subplot(13, 3, i + 1)  # 13 rows x 3 columns of subplots
    sns.boxplot(data_train[column[i]], orient="v", width=0.5)  # box plot
    plt.ylabel(column[i], fontsize=8)
plt.show()
Method 2: Model-based prediction
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff**2)
    n = len(y_pred)
    return np.sqrt(sum_sq/n)
def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)
# Detect outliers with a prediction model
def find_outliers(model, X, y, sigma=3):
    # predict y values using model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if predicting fails, try fitting the model first
    except:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)
    # calculate residuals between the model prediction and true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    # calculate z statistic, define outliers to be where |z|>sigma
    z = (resid - mean_resid)/std_resid
    outliers = z[abs(z)>sigma].index
    # print and plot the results
    print('R2=',model.score(X,y))
    print('rmse=',rmse(y, y_pred))
    print("mse=",mean_squared_error(y,y_pred))
    print('---------------------------------------')
    print('mean of residuals:',mean_resid)
    print('std of residuals:',std_resid)
    print('---------------------------------------')
    print(len(outliers),'outliers:')
    print(outliers.tolist())
    plt.figure(figsize=(15,5))
    ax_131 = plt.subplot(1,3,1)
    plt.plot(y,y_pred,'.')
    plt.plot(y.loc[outliers],y_pred.loc[outliers],'ro')
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred');
    ax_132 = plt.subplot(1,3,2)
    plt.plot(y,y-y_pred,'.')
    plt.plot(y.loc[outliers],y.loc[outliers]-y_pred.loc[outliers],'ro')
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred');
    ax_133 = plt.subplot(1,3,3)
    z.plot.hist(bins=50,ax=ax_133)
    z.loc[outliers].plot.hist(color='r',bins=50,ax=ax_133)
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('z')
    plt.savefig('outliers.png')
    return outliers
## Find outliers with a Ridge regression model and plot their distribution
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
X_train = data_train.iloc[:, 0:-1]
y_train = data_train.iloc[:, -1]
outliers = find_outliers(Ridge(), X_train, y_train)
5 Data distribution visualization
## Plot a histogram and Q-Q plot for every column to check whether the training data is approximately normal
train_cols = 6
train_rows = len(data_train.columns)
plt.figure(figsize=(4*train_cols, 4*train_rows))
i = 0
for col in data_train.columns:
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    sns.distplot(data_train[col], fit=stats.norm)
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    res = stats.probplot(data_train[col], plot=plt)
plt.show()
6 Train/test feature distribution comparison
## For every feature, compare the distributions of the training and test data and look for features whose distributions differ
dist_cols = 6
dist_rows = len(data_test.columns)
plt.figure(figsize=(4*dist_cols, 4*dist_rows))
i = 1
for col in data_test.columns:
    ax = plt.subplot(dist_rows, dist_cols, i)
    ax = sns.kdeplot(data_train[col], color="Red", shade=True)
    ax = sns.kdeplot(data_test[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
## Look at the distributions of features 'V5', 'V9', 'V11', 'V17', 'V22', 'V27'
drop_col = 6
drop_row = 1
plt.figure(figsize=(5*drop_col, 5*drop_row))
i = 1
for col in ["V5", "V9", "V11", "V17", "V22", "V27"]:
    ax = plt.subplot(drop_row, drop_col, i)
    ax = sns.kdeplot(data_train[col], color="Red", shade=True)
    ax = sns.kdeplot(data_test[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
## Record the inconsistently distributed features that can be dropped
drop_columns_1 = ['V5','V9','V11','V17','V22','V27']
7 Feature correlation
data_train_drop = data_train.drop(drop_columns_1, axis=1)
train_corr = data_train_drop.corr()
train_corr
# Correlation heatmap
fig, ax = plt.subplots(figsize=(20, 16))  # set the canvas size
ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True)  # annot=True prints the coefficients
# Rank the correlations
data_train_drop = data_train.drop(drop_columns_1, axis=1)
plt.figure(figsize=(20, 16))  # figure width and height
colnm = data_train_drop.columns.tolist()  # column names
mcorr = data_train_drop[colnm].corr(method="spearman")  # correlation matrix: correlation between every pair of variables
mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix with the same shape as mcorr
mask[np.triu_indices_from(mask)] = True  # mask the upper triangle
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # matplotlib colormap object
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # heatmap of pairwise correlations
plt.show()
Part 2: Feature Engineering
1 Merge the data
## Concatenate the training and test sets
data_train["oringin"]="train"
data_test["oringin"]="test"
data_all=pd.concat([data_train,data_test],axis=0,ignore_index=True)
2 Drop the inconsistently distributed features
data_all.drop(drop_columns_1,axis=1,inplace=True)
3 Data normalization
## Min-max scale the numeric columns
cols_numeric = list(data_all.columns)
cols_numeric.remove("oringin")
def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())
data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
4 Box-Cox transformation of the features
## Apply a Box-Cox transform to each feature to bring it closer to normal
cols_transform = data_all.columns[0:-2]  # exclude 'target' and 'oringin'
for col in cols_transform:
    # transform column
    data_all.loc[:, col], _ = stats.boxcox(data_all.loc[:, col] + 1)
## Check the effect of the transformation
data_all_cols = 6
data_all_rows = len(data_all.columns[0:-2])
plt.figure(figsize=(4*data_all_cols, 4*data_all_rows))
i = 0
for col in data_all.columns[0:-2]:
    i += 1
    ax = plt.subplot(data_all_rows, data_all_cols, i)
    sns.distplot(data_all[col], fit=stats.norm)
    i += 1
    ax = plt.subplot(data_all_rows, data_all_cols, i)
    res = stats.probplot(data_all[col], plot=plt)
plt.show()
5 Transform the target variable
## Apply a power transform (1.5**y) to the target to bring it closer to a normal distribution, and plot the result
sp = data_train.target
data_train['targetlog'] = np.power(1.5, sp)
print(data_train['targetlog'].describe())
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
sns.distplot(data_train['targetlog'].dropna(), fit=stats.norm);
plt.subplot(1,2,2)
_ = stats.probplot(data_train['targetlog'].dropna(), plot=plt)
6 Extract the new training data
## Extract the training samples
train_scaler = data_all[data_all["oringin"]=="train"].copy()
train_scaler["label"] = data_train['targetlog']
train_scaler = train_scaler.drop(["oringin"], axis=1)
## Separate the training features from the label
train_scaler_X = train_scaler.drop(["label","target"], axis=1)
train_scaler_y = train_scaler.label
7 Feature selection
Method 1: VarianceThreshold (not applicable after min-max scaling)
## Variance threshold selection; returns the data after feature selection
## The threshold parameter is the variance cutoff
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(threshold=3).fit_transform(data_all.drop(["oringin"], axis=1))
Method 2: Correlation coefficients
## Initial screening by correlation with the target
mcorr = mcorr.abs()
numerical_corr = mcorr[mcorr['target'] > 0.1]['target']
features_corr = numerical_corr.sort_values(ascending=False).reset_index()
features_corr.columns = ['features_and_target', 'corr']
features_corr_select = features_corr[features_corr['corr'] > 0.1]  # keep features whose correlation with the target exceeds 0.1
print(features_corr_select)
select_features_corr = [col for col in features_corr_select['features_and_target'] if col not in ['target']]
train_scaler_X_corr_select = train_scaler_X[select_features_corr]
Method 3: f_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# Initialize a SelectKBest transformer that scores features with f_regression
sel = SelectKBest(f_regression, k='all')
sel = sel.fit(train_scaler_X, train_scaler_y)
# Call transform() (or fit_transform) to preprocess and transform the same data set
train_sel = sel.transform(train_scaler_X)
# Print the number of features before and after selection
print('Original number of features:', train_scaler_X.shape[1])
print('Number of selected features:', train_sel.shape[1])
plt.bar([i for i in range(len(sel.scores_))], sel.scores_)
plt.xlabel("feature index")
plt.ylabel("F-value (transformed from the correlation values)")
plt.show()
fr_sq = pd.DataFrame(sel.scores_, columns=["fr_Square"], index=train_scaler_X.columns)
fr_sq = fr_sq.reset_index()
fr_sq.sort_values('fr_Square', ascending=False)
Method 4: Mutual information
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
# Initialize a SelectKBest transformer that scores features with mutual information
sel = SelectKBest(mutual_info_regression, k='all')
sel = sel.fit(train_scaler_X, train_scaler_y)
# Scores
np.set_printoptions(precision=3)
print(sel.scores_)
# Call transform() (or fit_transform) to preprocess and transform the same data set
train_sel = sel.transform(train_scaler_X)
# Print the number of features before and after selection
print('Original number of features:', train_scaler_X.shape[1])
print('Number of selected features:', train_sel.shape[1])
mit_sq = pd.DataFrame(sel.scores_, columns=["MIT_Square"], index=train_scaler_X.columns)
mit_sq = mit_sq.reset_index()
mit_sq.sort_values('MIT_Square', ascending=False)
plt.bar([i for i in range(len(sel.scores_))], sel.scores_)
plt.xlabel("feature index")
plt.ylabel("Estimated MI value")
plt.show()
Method 5: Removing multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif(features):
    vif = pd.DataFrame()
    vif["index"] = features.columns
    vif["VIF"] = [variance_inflation_factor(features.values, i) for i in range(features.shape[1])]
    return vif
vif = calculate_vif(train_scaler_X_corr_select)
while vif['VIF'][vif['VIF'] > 10].any():
    # drop the feature with the largest VIF and recompute
    remove = vif.sort_values('VIF', ascending=False)['index'][:1]
    train_scaler_X_corr_select.drop(remove, axis=1, inplace=True)
    vif = calculate_vif(train_scaler_X_corr_select)
select_features_vif = list(vif["index"])
train_scaler_X_vif_select = train_scaler_X_corr_select[select_features_vif]
Method 6: Copula entropy
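The original notes give no code for this method. As a rough sketch: the copula entropy between each feature and the target equals the negative mutual information of their rank-transformed (uniform-margin) copies, so it can be approximated with scikit-learn's k-nearest-neighbour mutual-information estimator. The helper rank_uniform and the -0.01 cutoff below are illustrative assumptions, and a dedicated estimator (e.g. the copent package) could be used instead.
## Approximate copula-entropy screening (sketch; assumes train_scaler_X / train_scaler_y from Section 6)
from sklearn.feature_selection import mutual_info_regression
def rank_uniform(s):
    # empirical-CDF (rank) transform to uniform margins
    return s.rank(method='average') / (len(s) + 1)
u_X = train_scaler_X.apply(rank_uniform, axis=0)
u_y = rank_uniform(train_scaler_y)
# copula entropy Hc(X_i, y) = -MI of the rank-transformed pair; more negative means stronger dependence
ce = pd.Series(-mutual_info_regression(u_X, u_y), index=u_X.columns).sort_values()
print(ce)
select_features_ce = ce[ce < -0.01].index.tolist()  # keep features with clearly negative copula entropy (illustrative cutoff)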
Method 7: RFE
## Recursive feature elimination; returns the data after feature selection
## The estimator parameter is the base model
## The n_features_to_select parameter is the number of features to keep
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
RFE(estimator=LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=500), n_features_to_select=2).fit_transform(df_train_X, df_train_y)
Method 8: Penalty-based selection
## Feature selection with a penalized logistic regression as the base model (the code below uses the L2 penalty; use penalty='l1' with solver='liblinear' for L1)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
SelectFromModel(LogisticRegression(penalty='l2',C=0.1,solver='lbfgs',multi_class='auto')).fit_transform(df_train_X,df_train_y)
Method 9: Tree-based models
## Feature selection with GBDT as the base model
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
df_train_X.shape
clf = GradientBoostingRegressor()
clf = clf.fit(df_train_X, df_train_y)
clf.feature_importances_
model = SelectFromModel(clf, prefit=True)
df_train_X_new = model.transform(df_train_X)
df_train_X_new.shape
## Feature selection with LightGBM
import lightgbm
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train_scaler_X, train_scaler_y, test_size=0.2, random_state=0)
clf = lightgbm
train_matrix = clf.Dataset(X_train, label=y_train)
valid_matrix = clf.Dataset(X_valid, label=y_valid)
params = {
'boosting_type': 'gbdt',
#'boosting_type': 'dart',
'objective': 'regression',
'min_child_weight': 1.5,
'lambda_l2': 10,
'subsample': 0.7,
'colsample_bytree': 0.7,
'colsample_bylevel': 0.7,
'learning_rate': 0.01,
'seed': 2017,
}
num_round = 10000
early_stopping_rounds = 100
model = clf.train(params,
                  train_matrix,
                  num_round,
                  valid_sets=valid_matrix,
                  early_stopping_rounds=early_stopping_rounds)
lgb_feature = pd.DataFrame(model.feature_importance(), columns = ["feature_importance"], index=model.feature_name())
lgb_feature = lgb_feature.reset_index()
lgb_feature.sort_values('feature_importance',ascending=0)
Method 10: The Boruta algorithm
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_jobs = -1, max_depth = 5)
boruta = BorutaPy(
estimator = rf,
n_estimators = 'auto',
max_iter = 100 # number of trials to perform
)
# Fit the model
boruta.fit(np.array(train_scaler_X), np.array(train_scaler_y))
# Output the results
green_area = train_scaler_X.columns[boruta.support_].to_list()
blue_area = train_scaler_X.columns[boruta.support_weak_].to_list()
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)
data_all_process=data_all[['V0', 'V1', 'V2', 'V3', 'V6', 'V7', 'V8', 'V10', 'V12', 'V13', 'V16', 'V19', 'V28', 'V31', 'V36', 'V37',"oringin","target"]]
8 Dimensionality reduction
Method 1: Principal component analysis
## PCA; returns the data after dimensionality reduction
## The n_components parameter is the number of principal components
from sklearn.decomposition import PCA
PCA(n_components=2).fit_transform(data_all.drop(["oringin"], axis=1))
Method 2: Linear discriminant analysis
## LDA; returns the data after dimensionality reduction
## The n_components parameter is the number of dimensions to keep
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
LDA(n_components=2).fit_transform(df_train_X, df_train_y)
Part 3: Model Training
1 Get the training, validation, and test data
## Extract the training samples
def get_training_data():
    # extract training samples
    from sklearn.model_selection import train_test_split
    df_train = data_all[data_all["oringin"]=="train"].copy()
    df_train["label"] = train_scaler_y
    # split the label and the features
    y = df_train.label
    X = df_train.drop(["oringin","target","label"], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=100)
    return X_train, X_valid, y_train, y_valid
## Extract the test data (no target)
def get_test_data():
    df_test = data_all[data_all["oringin"]=="test"].reset_index(drop=True)
    return df_test.drop(["oringin","target"], axis=1)
2 Define the model scoring functions
from sklearn.metrics import make_scorer
# metric for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff**2)
    n = len(y_pred)
    return np.sqrt(sum_sq/n)
def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)
# scorer to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)
3 Outlier handling
Method 1: Box plots
plt.figure(figsize=(18, 10))
plt.boxplot(x=data_train.values, labels=data_train.columns)
plt.hlines([-7.5, 7.5], 0, 40, colors='r')
plt.show()
## Remove the training samples that contain outliers
data_train = data_train[data_train['V9'] > -7.5]
data_train.describe()
## Remove the test samples that contain outliers
data_test = data_test[data_test['V9'] > -7.5]
data_test.describe()
Method 2: Model-based prediction
## Detect outliers with a prediction model, reusing the find_outliers() function defined in Part 1, Section 4
## Get the training data
from sklearn.linear_model import Ridge
X_train, X_valid,y_train,y_valid = get_training_data()
test=get_test_data()
# find and remove outliers using a Ridge model
outliers = find_outliers(Ridge(), X_train, y_train)
# permanently remove these outliers from the data
#df_train = data_all[data_all["oringin"]=="train"]
#df_train["label"]=data_train.target1
#df_train=df_train.drop(outliers)
X_outliers=X_train.loc[outliers]
y_outliers=y_train.loc[outliers]
X_t=X_train.drop(outliers)
y_t=y_train.drop(outliers)
4 Train models on the data with outliers removed
def get_training_data_omitoutliers():
    y1 = y_t.copy()
    X1 = X_t.copy()
    return X1, y1
5 Train models with grid search
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, cross_val_predict, KFold
def train_model(model, param_grid=[], X=[], y=[],
                splits=5, repeats=5):
    # get unmodified training data, unless data to use already specified
    if len(y)==0:
        X,y = get_training_data_omitoutliers()
        #poly_trans=PolynomialFeatures(degree=2)
        #X=poly_trans.fit_transform(X)
        #X=MinMaxScaler().fit_transform(X)
    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)
    # perform a grid search if param_grid given
    if len(param_grid)>0:
        # setup grid search parameters
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)
        # search the grid
        gsearch.fit(X,y)
        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_
        # get cv-scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx,'mean_test_score'])
        cv_std = grid_results.loc[best_idx,'std_test_score']
    # no grid search, just cross-val score for given model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
        # fit the model on the full training data so it can be scored below
        model.fit(X, y)
    # combine mean and std cv-score in to a pandas series
    cv_score = pd.Series({'mean':cv_mean,'std':cv_std})
    # predict y using the fitted model
    y_pred = model.predict(X)
    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=',model.score(X,y))
    print('rmse=',rmse(y, y_pred))
    print('mse=',mse(y, y_pred))
    print('cross_val: mean=',cv_mean,', std=',cv_std)
    # residual plots
    y_pred = pd.Series(y_pred,index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid)/std_resid
    n_outliers = sum(abs(z)>3)
    plt.figure(figsize=(15,5))
    ax_131 = plt.subplot(1,3,1)
    plt.plot(y,y_pred,'.')
    plt.xlabel('y')
    plt.ylabel('y_pred');
    plt.title('corr = {:.3f}'.format(np.corrcoef(y,y_pred)[0][1]))
    ax_132 = plt.subplot(1,3,2)
    plt.plot(y,y-y_pred,'.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred');
    plt.title('std resid = {:.3f}'.format(std_resid))
    ax_133 = plt.subplot(1,3,3)
    z.plot.hist(bins=50,ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))
    return model, cv_score, grid_results
## Store the best models and their scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean','std'])
# no. k-fold splits
splits=5
# no. k-fold iterations
repeats=5
6 Regression models
Model 1: XGBoost regression
from xgboost import XGBRegressor
model = 'XGB'
opt_models[model] = XGBRegressor(objective='reg:squarederror')
param_grid = { 'max_depth': [1,2,3],
'learning_rate': [0.2,0.3,0.4],
'n_estimators': [50,100,150],
'gamma':[0,0.05,0.1],
'min_child_weight': [0,1,2],
}
opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])  # DataFrame.append was removed in pandas 2.0
from xgboost import plot_importance
plot_importance(opt_models[model])
plt.show()
score_test = mean_squared_error(y_valid, opt_models[model].predict(X_valid))
print("XGBRegressor RandomizedSearchCV test MSE: ", score_test)
Model 2: LightGBM regression
import lightgbm as lgb
model = 'LGB'
opt_models[model] = lgb.LGBMRegressor(objective='regression',boosting_type='gbdt')
param_grid = { 'max_depth': [-2,-1,0,1,2],
'learning_rate': [0.01,0.1,1],
'n_estimators': [200,600,1000,2000,5000],
}
opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
Model 3: Random forest regression
from sklearn.ensemble import RandomForestRegressor
model = 'RandomForest'
opt_models[model] = RandomForestRegressor()
param_grid = {'n_estimators':[100,150,200],
'max_features':[8,12,16,20,24],
'min_samples_split':[2,4,6]}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=5, repeats=1)
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
7 Learning curve
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
X = train_scaler_X.values
y = train_scaler_y.values
title = "LinearRegression"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = model_selection.ShuffleSplit(n_splits=100,
                                  test_size=0.2, random_state=0)
estimator = SGDRegressor()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=-1)
8 Validation curve
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import validation_curve
X = train_scaler_X.values
y = train_scaler_y.values
# max_iter=1000, tol=1e-3, penalty='l1', alpha=0.00001
param_range = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
train_scores, test_scores = validation_curve(
    SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1'), X, y, param_name="alpha", param_range=param_range,
    cv=10, scoring='r2', n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with SGDRegressor")
plt.xlabel("alpha")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2, color="r")
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
color="g")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
Part 4: Model Fusion
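This part is left empty in the original notes. As a minimal sketch, the tuned models stored in opt_models (Part 3) can be blended by averaging their predictions on the test features from get_test_data(); the equal weights and the inversion of the 1.5**y target transform below are illustrative assumptions, not something the original notebook prescribes.
## Simple average blending of the tuned models (illustrative sketch)
test_X = get_test_data()
blend_preds = pd.DataFrame({name: m.predict(test_X) for name, m in opt_models.items()})
blend_pred = blend_preds.mean(axis=1)  # equal weights; the cross-validation scores in score_models could be used to weight instead
# the models were trained on label = 1.5**target (Part 2, Section 5), so map back to the original scale
final_pred = np.log(blend_pred) / np.log(1.5)
print(final_pred.describe())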
Part 5: Useful Snippets
1 Problem: calling fit(x_train, y_train) raises an error
ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
## Check for infinite values
# False: the data contains inf/NaN
# True: the data contains no inf/NaN
print(np.isfinite(train).all())
# False: no infinite values
# True: infinite values present
print(np.isinf(train).any())
# Replace infinite values
train_inf = np.isinf(train)
train[train_inf] = 0
## Check for missing values; either of the following works
# False: the corresponding feature has no missing values
# True: the feature has missing values
print(train.isnull().any())
print(np.isnan(train).any())
# Inspect the records with missing values
train_null = pd.isnull(train)
train_null = train[train_null == True]
print(train_null)
## Handle missing values; either of the following works
# Drop the rows that contain missing values
train.dropna(inplace=True)
# Or fill the missing values (assignment is needed, and the fill value should be numeric)
train = train.fillna(100)
2 Fix for truncated (ellipsis) output when printing DataFrames
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)