直接导入相关的库
import xgboost as xgb
import pandas as pd
import numpy as np
import pickle
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from scipy.sparse import csr_matrix, hstack
from sklearn.cross_validation import KFold, train_test_split
from xgboost import XGBRegressor
from scipy import stats
import seaborn as sns
from copy import deepcopy
%matplotlib inline
# This may raise an exception in earlier versions of Jupyter
%config InlineBackend.figure_format = 'retina'
在这一部分,需要做一个简短的数据探索,看看有什么样的数据集,以及是否能找到其中的任何模式。
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')
# Shown as the cell result: (number of rows, number of columns) of the training frame.
train.shape
(188318, 132)
训练集共约 18.8 万条样本、132 列,数据量还可以。
这个数据集有116个类别特征(列名以 cat 开头)和14个连续(数值)特征(列名以 cont 开头)。
此外,还有 id 列和赔偿金额 loss 列(即预测目标),总计为132列。
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()
正如我们看到的,所有的连续特征都已被缩放到[0,1]区间,均值基本为0.5。也就是说数据已经被预处理过了,我们拿到的是处理后的特征数据。
查看缺失值
# True if at least one cell anywhere in the training frame is missing.
train.isnull().values.any()
False
此数据集没有缺失值
从各列的数据类型统计(例如 train.info() 的输出)可以看到:float64(15) 和 int64(1) 共16列是数值型(其中14列是连续特征,另外两列是 id 和 loss),其余116个 object 列是类别数据
可以把连续值特征和类别特征选出来看一下
# Categorical features: every column stored with the object dtype.
cat_features = train.select_dtypes(include=['object']).columns.tolist()
print("Categorical: {} features".format(len(cat_features)))

# Continuous features: numeric columns, excluding the target and the row identifier.
cont_features = [
    name
    for name in train.select_dtypes(include=['float64', 'int64']).columns
    if name not in ('loss', 'id')
]
print("Continuous: {} features".format(len(cont_features)))
Categorical: 116 features
Continuous: 14 features
接下来看看每个类别特征中不同取值的个数:
# Number of distinct values observed in each categorical column,
# in the same order as cat_features.
cat_uniques = [len(train[cat].unique()) for cat in cat_features]

# DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0, so
# build the frame with the regular constructor; `columns` pins the column
# order regardless of dict ordering guarantees.
uniq_values_in_categories = pd.DataFrame(
    {'cat_name': cat_features, 'unique_values': cat_uniques},
    columns=['cat_name', 'unique_values'],
)
# Preview the first rows as the cell result.
uniq_values_in_categories.head()
画图来看看
# Two side-by-side panels: the full histogram and a zoom on low cardinalities.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(16, 5)

# Left panel: distinct-value counts across all categorical features.
ax1.hist(uniq_values_in_categories['unique_values'], bins=50)
ax1.set_title('Amount of categorical features with X distinct values')
ax1.set_xlabel('Distinct values in a feature')
ax1.set_ylabel('Features')
ax1.annotate(
    'A feature with 326 vals',
    xy=(322, 2),
    xytext=(200, 38),
    arrowprops={'facecolor': 'black'},
)

# Right panel: only features with at most 30 distinct values.
# The x-limits are fixed before drawing so the histogram does not rescale them.
ax2.set_xlim(2, 30)
ax2.set_title('Zooming in the [0,30] part of left histogram')
ax2.set_xlabel('Distinct values in a feature')
ax2.set_ylabel('Features')
ax2.grid(True)
ax2.hist(
    uniq_values_in_categories.loc[
        uniq_values_in_categories['unique_values'] <= 30, 'unique_values'
    ],
    bins=30,
)
ax2.annotate(
    'Binary features',
    xy=(3, 71),
    xytext=(7, 71),
    arrowprops={'facecolor': 'black'},
)