Datawhale Beginner Data Mining Study Notes - Task 4: Model Building and Parameter Tuning
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
In [3]:
Train_feature = pd.read_csv('Train_data_for_tree0401.csv')
In [39]:
Train_feature = Train_feature.dropna().replace('-', 0).reset_index(drop=True)
Train_feature['notRepairedDamage'] = Train_feature['notRepairedDamage'].astype(np.float32)
X_col = [x for x in Train_feature.columns if x not in ['price']]
train_X = Train_feature[X_col]
train_y = Train_feature['price']
In [40]:
model = LinearRegression(normalize=True)
model = model.fit(train_X, train_y)
print('intercept:' + str(model.intercept_))
sorted(dict(zip(X_col, model.coef_)).items(), key=lambda x: x[1], reverse=True)
Out[40]:
[('v_11', 6084774067.661221),
 ('v_12', 1169211727.8637226),
 ('v_3', 173943710.44341362),
 ('v_5', 44382791.808509886),
 ('v_8', 12467577.867135908),
 ('v_9', 11234931.297970595),
 ('v_7', 7708480.710945234),
 ('v_6', 6590435.539064932),
 ('v_13', 3516420.4394493876),
 ('gearbox', 1417.567278200749),
 ('bodyType', 83.74840612200833),
 ('model', 2.694897505827548),
 ('power', 1.9980947074346471),
 ('car_age', 0.59427659408091),
 ('model_price_std', 0.49898155646803527),
 ('model_price_average', 0.36565495442082746),
 ('model_price_median', 0.24026516099814824),
 ('model_amount', 0.23546828914098453),
 ('model_price_max', 0.0009470776233357769),
 ('name', 0.0007322030459381923),
 ('model_price_sum', -4.332819172574833e-05),
 ('SaleID', -5.766021820881294e-05),
 ('model_price_min', -0.04082794066722729),
 ('brand', -21.528854138945846),
 ('fuelType', -122.39211758279552),
 ('kilometer', -293.73564674023186),
 ('notRepairedDamage', -340.2009693374889),
 ('v_14', -20539.141913513475),
 ('v_4', -1289190.5738253484),
 ('v_0', -407454622.87835795),
 ('v_2', -3400744561.3454585),
 ('v_10', -7844761209.690098),
 ('v_1', -9389946210.982061)]
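The huge, opposite-signed coefficients on the anonymous v_* features are a red flag for multicollinearity: with near-collinear columns, ordinary least squares can only fit the data by blowing individual weights up against each other. A quick diagnostic, not part of the original run, is the condition number of the v_* block (values far above ~1e3 suggest an ill-conditioned design matrix):

# Collinearity check on the anonymous features (a sketch, assuming train_X from above).
v_cols = [c for c in X_col if c.startswith('v_')]
print('condition number:', np.linalg.cond(train_X[v_cols].values))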
In [41]:
subsample_index = np.random.randint(low=0, high=len(train_y), size=50)
In [42]:
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='c')
plt.scatter(train_X['v_9'][subsample_index], model.predict(train_X.loc[subsample_index]), color='orange')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price', 'Predicted Price'], loc='upper right')
print('The predicted price is obviously different from the true price')
plt.show()
The predicted price is obviously different from the true price
In [43]:
train_y_ln = np.log(train_y + 1)
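A quick look at the target before and after the transform (a small addition, not part of the original run) shows why this helps: price is heavily right-skewed, while log(price + 1) is roughly bell-shaped. np.log1p(train_y) is the numerically safer equivalent of np.log(train_y + 1).

# Compare the raw and log-transformed target distributions.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].hist(train_y, bins=50)
axes[0].set_title('price')
axes[1].hist(train_y_ln, bins=50)
axes[1].set_title('log(price + 1)')
plt.show()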
In [44]:
model = model.fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
sorted(dict(zip(X_col, model.coef_)).items(), key=lambda x: x[1], reverse=True)
intercept:-121313.87402487465
Out[44]:
[('v_1', 79743.7474652253),
 ('v_10', 65711.48505158548),
 ('v_2', 32497.906882219868),
 ('v_0', 2768.076046750281),
 ('v_5', 518.8742878167941),
 ('v_9', 138.20541397987182),
 ('v_8', 99.108982161397),
 ('v_7', 89.4930244557067),
 ('v_4', 51.869935725691704),
 ('v_14', 1.8528477443407432),
 ('gearbox', 0.04749290715678731),
 ('model', 0.00020572763586231547),
 ('power', 8.740174236930652e-05),
 ('model_price_std', 1.3685948433111726e-05),
 ('car_age', 8.92963334278759e-06),
 ('model_amount', 8.251809483472434e-06),
 ('model_price_median', 1.3110722198524063e-06),
 ('model_price_max', 1.569588084024212e-07),
 ('SaleID', 8.454695238489206e-09),
 ('model_price_sum', -2.024231713146628e-09),
 ('name', -4.4636808347839946e-08),
 ('model_price_min', -2.8219462851048217e-06),
 ('model_price_average', -6.097114531010631e-06),
 ('bodyType', -0.0003714019605091863),
 ('brand', -0.0015827534313286556),
 ('fuelType', -0.003235623622271413),
 ('kilometer', -0.009841833981445187),
 ('notRepairedDamage', -0.25069569496152877),
 ('v_13', -108.70078635196288),
 ('v_6', -165.5207984415195),
 ('v_3', -2289.9312995114688),
 ('v_12', -12405.934285194795),
 ('v_11', -54366.023020928704)]
In [45]:
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='c')
plt.scatter(train_X['v_9'][subsample_index], np.exp(model.predict(train_X.loc[subsample_index])), color='orange')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price', 'Predicted Price'], loc='upper right')
print('The predicted price looks reasonable after the np.log transform')
plt.show()
The predicted price looks reasonable after the np.log transform
In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
In [19]:
def log_transfer(func):
    # Wrap a metric so it is evaluated in log space; nan_to_num replaces
    # the NaNs that np.log produces for non-positive predictions.
    def wrapper(y, yhat):
        result = func(np.log(y), np.nan_to_num(np.log(yhat)))
        return result
    return wrapper
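log_transfer lets us score the model fit on the raw target in log space, so the result is comparable to models trained on train_y_ln. A small usage sketch with illustrative values:

# The wrapped metric takes raw prices and scores them in log space.
log_mae = log_transfer(mean_absolute_error)
print(log_mae(np.array([100.0, 200.0]), np.array([110.0, 190.0])))  # ~0.073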
In [20]:
scores = cross_val_score(model, X=train_X, y=train_y, verbose=1, cv=5, scoring=make_scorer(log_transfer(mean_absolute_error)))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: RuntimeWarning: invalid value encountered in log
  This is separate from the ipykernel package so we can avoid doing imports until
(the warning above is emitted once per CV fold, 5 times in total)
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 0.6s finished
In [21]:
print('AVG:', np.mean(scores))
AVG: 1.4161313776402409
In [22]:
scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=1, cv=5, scoring=make_scorer(mean_absolute_error))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 0.6s finished
In [23]:
print('AVG:', np.mean(scores))
AVG: 0.19400747527820822
In [24]:
scores = pd.DataFrame(scores.reshape(1, -1))
scores.columns = ['cv' + str(x) for x in range(1, 6)]
scores.index = ['MAE']
scores
Out[24]:
          cv1       cv2       cv3       cv4       cv5
MAE  0.192398  0.194724  0.195045  0.192539  0.195331
In [25]:
from sklearn.model_selection import learning_curve, validation_curve
In [26]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_size=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('score')
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_size,
        scoring=make_scorer(mean_absolute_error))
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # Shade one standard deviation around the mean train/validation scores.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r',
             label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g',
             label='Cross-validation score')
    plt.legend(loc='best')
    return plt
In [27]:
plot_learning_curve(LinearRegression(), 'Linear_model', train_X[:1000], train_y_ln[:1000], ylim=(0.0, 0.5), cv=5, n_jobs=1)
Out[27]:
<module 'matplotlib.pyplot' from 'C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py'>
In [46]:
Train_feature = Train_feature.dropna()
train_X = Train_feature[X_col]
train_y = Train_feature['price']
train_y_ln = np.log(train_y + 1)
In [47]:
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
In [48]:
models = [RandomForestRegressor(),
          XGBRegressor(n_estimators=100, objective='reg:squarederror'),
          LGBMRegressor(n_estimators=100)]
In [49]:
result = dict()
for model in models:
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                             scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    print(model_name + ' is finished')
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
(the warning above is emitted once per CV fold, 5 times in total)
RandomForestRegressor is finished
XGBRegressor is finished
LGBMRegressor is finished
In [50]:
result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result
Out[50]:
     RandomForestRegressor  XGBRegressor  LGBMRegressor
cv1               0.137174      0.138117       0.138876
cv2               0.136303      0.135521       0.141031
cv3               0.135740      0.134830       0.140041
cv4               0.134971      0.133926       0.138857
cv5               0.138563      0.138735       0.139936
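For a quick side-by-side summary, the per-fold scores can be averaged (a small addition to the original notebook); all three models land around 0.13-0.14 MAE on the log target, with XGBRegressor and RandomForestRegressor marginally ahead of LGBMRegressor out of the box.

# Mean CV MAE per model.
print(result.mean())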
In [53]:
# LGBM parameter grid
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]
bagging_fraction = []
feature_fraction = []
drop_rate = []
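bagging_fraction, feature_fraction and drop_rate are declared above but left empty, so the grid search below only covers objective, num_leaves and max_depth. If they were included, a sketch with hypothetical candidate values could look like this; note that the grid grows multiplicatively (5 x 7 x 7 = 245 candidates already), so a randomized search becomes attractive as dimensions are added.

# Hypothetical candidate values, not part of the original run.
bagging_fraction = [0.6, 0.8, 1.0]
feature_fraction = [0.6, 0.8, 1.0]
parameters_full = {'objective': objective,
                   'num_leaves': num_leaves,
                   'max_depth': max_depth,
                   'bagging_fraction': bagging_fraction,
                   'feature_fraction': feature_fraction}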
In [51]:
from sklearn.model_selection import GridSearchCV
In [54]:
parameters = {'objective': objective, 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
clf = GridSearchCV(model, parameters, cv=5)
# Note: the search here is fit on the raw target, while the final CV below
# scores on the log target train_y_ln.
clf = clf.fit(train_X, train_y)
In [55]:
clf.best_params_
Out[55]:
{'max_depth': 20, 'num_leaves': 55, 'objective': 'regression'}
In [56]:
model = LGBMRegressor(objective='regression',
                      num_leaves=55,
                      max_depth=20)
In [57]:
np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)))
Out[57]:
0.133177329214146
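With the 5-fold MAE down from about 0.1397 to 0.1332, the tuned LGBMRegressor beats the defaults. As a final sketch (not part of the original notebook), the model can be refit on the full training set and its predictions mapped back to the price scale with np.expm1, the inverse of the log(price + 1) transform:

# Refit on all training data and recover predictions on the original scale.
model.fit(train_X, train_y_ln)
pred_price = np.expm1(model.predict(train_X))
print(pred_price[:5])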