Exploratory Data Analysis (EDA)
For a data mining task, the standard routine starts with data analysis. A complete analysis workflow can be organized as follows.
Import libraries
import sys
import gc
import pandas as pd
# show all columns
pd.set_option('display.max_columns', None)
# show all rows
pd.set_option('display.max_rows', None)
# set the display width of each value to 100 (default is 50)
pd.set_option('display.max_colwidth', 100)
import matplotlib
import numpy as np
import scipy as sp
from scipy import stats
from scipy.stats import norm, skew, kurtosis  # for some statistics
import IPython
from IPython import display
import sklearn
import random
import time
import pickle
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix
#Configure Visualization Defaults
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12, 8
#Titanic-kernels
# Classification
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
#Common Model Helpers
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from pandas.plotting import scatter_matrix
train_data = pd.read_csv('./input/filename.csv', sep=',', names=[''], header=None)
# other useful read_csv options: index_col=False, encoding='GB2312', parse_dates=True,
# dtype={'user_id': 'uint32', 'register_day': 'uint8', 'register_type': 'uint8', 'device_type': 'uint32'}
#dateparse = lambda dates: datetime.strptime(dates, '%Y-%m-%d %H-%M-%S')
#train_data = pd.read_csv('./input/filename.csv', parse_dates=['time'], date_parser=dateparse, index_col=['time'])  # use the time column as the index
train_data = pd.read_csv('../input/train_dataset.csv', sep=',')
test_data = pd.read_csv('../input/test_dataset.csv', sep=',')
submit_example = pd.read_csv('../input/submit_example.csv', sep=',')
print('train shape:', train_data.shape)
print('test shape:', test_data.shape)
print('sample shape:', submit_example.shape)
Downcast column dtypes to save memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
Concatenate the train and test sets
target_col = "target"
target = train_data[target_col]  #.apply(lambda x: np.log1p(x))
del train_data[target_col]
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
data = data.fillna(-1)
| Specifier | Common Equivalent | Signing | Bits | Bytes | Minimum Value | Maximum Value |
| --- | --- | --- | --- | --- | --- | --- |
| int8_t | signed char | Signed | 8 | 1 | -128 | 127 |
| uint8_t | unsigned char | Unsigned | 8 | 1 | 0 | 255 |
| int16_t | short | Signed | 16 | 2 | -32,768 | 32,767 |
| uint16_t | unsigned short | Unsigned | 16 | 2 | 0 | 65,535 |
| int32_t | int | Signed | 32 | 4 | -2,147,483,648 | 2,147,483,647 |
| uint32_t | unsigned int | Unsigned | 32 | 4 | 0 | 4,294,967,295 |
| int64_t | long long | Signed | 64 | 8 | -9,223,372,036,854,775,808 | 9,223,372,036,854,775,807 |
| uint64_t | unsigned long long | Unsigned | 64 | 8 | 0 | 18,446,744,073,709,551,615 |
train_data.to_csv('./output/by_filename.csv', sep=',', columns=[''], index=False)  #header=False
train_data.describe()
Data preprocessing:
Data cleaning: drop samples that cannot be trusted (remove abnormal samples); consider discarding fields with an extremely high missing rate.
Data sampling: down-sample or up-sample so that the classes are balanced (see the sketch below).
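A minimal sketch of both steps, assuming a hypothetical 80% missing-rate threshold and a hypothetical binary label column named 'label' (neither is specified in the text above):
# drop columns whose missing rate exceeds the threshold (placeholder value: 80%)
missing_ratio = train_data.isnull().mean()
train_data = train_data.drop(columns=missing_ratio[missing_ratio > 0.8].index)
# naive down-sampling of the majority class to balance a binary label
counts = train_data['label'].value_counts()
minority, majority = counts.idxmin(), counts.idxmax()
balanced = pd.concat([
    train_data[train_data['label'] == minority],
    train_data[train_data['label'] == majority].sample(counts.min(), random_state=42)
], ignore_index=True)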
#analysis
def stat_df(df):
    stats = []
    for col in df.columns:
        stats.append((col, df[col].nunique(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))
    stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])
    stats_df.sort_values('Percentage of missing values', ascending=False, inplace=True)
    return stats_df
stat_df(train_data)
#analysis columns
for col in train_data.columns:
    print('feature:', col)
    print('\t Percentage of missing values:\n\t\t train:', train_data[col].isnull().sum() * 100 / train_data.shape[0])
    print('\t\t test:', test_data[col].isnull().sum() * 100 / test_data.shape[0])
    print('\t feature unique:\n\t\t train:', train_data[col].nunique())
    print('\t\t test:', test_data[col].nunique())
    print('\t feature type:', data[col].dtype)
for col in data.columns:
    print('feature:', col)
    print('feature value_counts:\ntrain:', train_data[col].value_counts().head(10))
    print('test:', test_data[col].value_counts().head(10))
    print('-------------------------')
#Date handling
train_data['time'] = pd.to_datetime(train_data['time'])
train_data['time'] = train_data['time'].astype(datetime)
#Extract the day and hour from a unix timestamp
train_df['day'] = train_df['time'].apply(lambda x: int(time.strftime("%d", time.localtime(x))))
train_df['hour'] = train_df['time'].apply(lambda x: int(time.strftime("%H", time.localtime(x))))
train["time_sting"] = train["context_timestamp"].apply(lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x)))  # display time of the ad item: convert the unix timestamp to a datetime string
train["time_sting"] = pd.to_datetime(train["time_sting"])
train["hour"] = train["time_sting"].dt.hour
train["day"] = train["time_sting"].dt.day
train["day"] = train["day"].apply(lambda x: 0 if x == 31 else x)
grouped_df = train.groupby(["day", "hour"])["is_trade"].aggregate("mean").reset_index()
grouped_df = grouped_df.pivot('day', 'hour', 'is_trade')
Missing value handling
#missing data
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(20)
Continuous features
Check the number and percentage of missing values (null values) for each variable.
1) Drop variables whose missing rate is above 80% and that will stay highly missing in the future.
2) If "missing" carries business meaning, keep the variable and fill the missing values with a default value.
3) If "missing" carries no business meaning:
a) Continuous variables: for an approximately normal distribution, fill with the mean, which preserves the mean of the data; for a long-tailed distribution, fill with the median to avoid the influence of outliers.
In code this can be done with Imputer from sklearn.preprocessing (SimpleImputer in sklearn.impute in recent scikit-learn versions); see the sketch after this list.
b) If a large fraction is missing, filling with the mean is not suitable; instead, predict the missing values from other variables or set them to a default value.
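A minimal sketch of mean/median imputation, assuming a placeholder list num_cols of continuous columns (recent scikit-learn exposes this as sklearn.impute.SimpleImputer):
from sklearn.impute import SimpleImputer
num_cols = ['col_a', 'col_b']  # placeholder: the continuous columns to impute
imputer = SimpleImputer(strategy='median')  # or strategy='mean' for roughly normal distributions
train_data[num_cols] = imputer.fit_transform(train_data[num_cols])
test_data[num_cols] = imputer.transform(test_data[num_cols])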
Discrete features
1. Check the number and percentage of missing values (null values) for each variable.
1) Drop variables whose missing rate is above 80% and that will stay highly missing in the future.
2) If "missing" carries business meaning, keep the variable and fill the missing values with a default value.
3) If "missing" carries no business meaning:
a) For discrete variables, fill with the mode.
b) If a large fraction is missing, filling with the median or mean is not appropriate; instead, predict the missing values from other variables or set them to a default value.
1. Fill with the mode or the median
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)  # mode
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)  # median
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))  # fill with the median within each Neighborhood group
2. Assign a value that stands for "missing"
train_data.loc[train_data.Cabin.isnull(), 'Cabin'] = 'U0'
data[data.select_dtypes('object').columns.tolist()] = data.select_dtypes('object').fillna("-999")
data.fillna(-999, inplace=True)
3. Predict the missing values with a model
#Build a model to predict Age; several models can be trained and then blended to improve the accuracy of the prediction
from sklearn import ensemble
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
def fill_missing_age(missing_age_train, missing_age_test):
    missing_age_X_train = missing_age_train.drop(['Age'], axis=1)
    missing_age_Y_train = missing_age_train['Age']
    missing_age_X_test = missing_age_test.drop(['Age'], axis=1)
    # model 1: gradient boosting
    gbm_reg = GradientBoostingRegressor(random_state=42)
    gbm_reg_param_grid = {'n_estimators': [2000], 'max_depth': [2, 3], 'learning_rate': [0.01], 'max_features': [3, 4, 5]}
    gbm_reg_grid = model_selection.GridSearchCV(gbm_reg, gbm_reg_param_grid, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    gbm_reg_grid.fit(missing_age_X_train, missing_age_Y_train)
    print('Age feature Best GB Params:' + str(gbm_reg_grid.best_params_))
    print('Age feature Best GB Score:' + str(gbm_reg_grid.best_score_))
    print('GB Train Error for "Age" Feature Regressor:' + str(gbm_reg_grid.score(missing_age_X_train, missing_age_Y_train)))
    missing_age_test.loc[:, 'Age_GB'] = gbm_reg_grid.predict(missing_age_X_test)
    print(missing_age_test['Age_GB'][:4])
    # model 2: random forest
    rf_reg = RandomForestRegressor()
    rf_reg_param_grid = {'n_estimators': [2000], 'max_depth': [4, 5]}  # 'random_state': [0]
    rf_reg_grid = model_selection.GridSearchCV(rf_reg, rf_reg_param_grid, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    rf_reg_grid.fit(missing_age_X_train, missing_age_Y_train)
    print('Age feature Best RF Params:' + str(rf_reg_grid.best_params_))
    print('Age feature Best RF Score:' + str(rf_reg_grid.best_score_))
    print('RF Train Error for "Age" Feature Regressor' + str(rf_reg_grid.score(missing_age_X_train, missing_age_Y_train)))
    missing_age_test.loc[:, 'Age_RF'] = rf_reg_grid.predict(missing_age_X_test)
    print(missing_age_test['Age_RF'][:4])
    # merge the two models by averaging their predictions
    print('shape1', missing_age_test['Age'].shape, missing_age_test[['Age_GB', 'Age_RF']].mode(axis=1).shape)
    # missing_age_test['Age'] = missing_age_test[['Age_GB', 'Age_LR']].mode(axis=1)
    missing_age_test.loc[:, 'Age'] = missing_age_test[['Age_GB', 'Age_RF']].mean(axis=1)
    print(missing_age_test['Age'][:4])
    missing_age_test.drop(['Age_GB', 'Age_RF'], axis=1, inplace=True)
    return missing_age_test
combined_train_test.loc[(combined_train_test.Age.isnull()), 'Age'] = fill_missing_age(missing_age_train, missing_age_test)
Outlier handling
#explore outliers
fig, ax = plt.subplots()
ax = plt.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()  # plot living area against price, then delete the outliers
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)  # deleting outliers
#There may be other outliers in the training data. However, if outliers also exist in the test data, removing them all could badly hurt the model. That is why, instead of deleting them all, we only try to make some of our models more robust to them (one option is sketched below).
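As one hedged illustration of making models less sensitive to the remaining outliers (the text above does not prescribe a specific technique), features can be scaled with outlier-robust statistics, for example scikit-learn's RobustScaler:
from sklearn.preprocessing import RobustScaler
# RobustScaler centers on the median and scales by the IQR, so extreme values have less influence
num_cols = [c for c in train.select_dtypes(include=[np.number]).columns if c != 'SalePrice']
scaler = RobustScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])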
#Data splitting
df_part_1 = df['2014-11-18':'2014-11-23']  # slide a time window over a DatetimeIndex
#Compute the Pearson correlation matrix and plot it as a heatmap
def correlation_heatmap(df):
    _, ax = plt.subplots(figsize=(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        df.corr(),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12}
    )
    plt.title('Pearson Correlation of Features', y=1.05, size=15)
correlation_heatmap(data1)
Compute the correlation coefficient of each numeric feature with the target
cols = [col for col in train_data.columns if col not in [target_col] and train_data[col].dtype != 'object']
labels = []
values = []
for col in cols:
    labels.append(col)
    values.append(np.corrcoef(train_data[col].values, train_data[target_col].values)[0, 1])
corr_df = pd.DataFrame({'col_labels': labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values')
ind = np.arange(len(labels))
width = 0.5
fig, ax = plt.subplots(figsize=(12, 40))
rects = ax.barh(ind, np.array(corr_df['corr_values'].values), color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_xlabel('Correlation coefficient')
ax.set_title('Correlation coefficient of the variables')
Scatter plots of individual values
target_col = "target"
plt.figure(figsize=(8, 6))
plt.scatter(range(len(np.sort(train_data[target_col].values))), np.sort(train_data[target_col].values))
plt.xlabel('index', fontsize=12)
plt.ylabel('yield', fontsize=12)
plt.show()
fig, ax = plt.subplots()
ax = plt.scatter(x=train_data['B14'], y=train_data['收率'])
plt.ylabel('收率', fontsize=13)
plt.xlabel('B14', fontsize=13)
plt.show()
#target
target_col = 'target'
#We use the numpy function log1p, which applies log(1+x) to all elements of the column
train_data[target_col] = np.log1p(train_data[target_col])
#Check the new distribution
sns.distplot(train_data[target_col], fit=norm)
(mu, sigma) = norm.fit(train_data[target_col])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('{} distribution'.format(target_col))
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train_data[target_col], plot=plt)
plt.show()
Look at the density curve and histogram of a variable
plt.figure(figsize=(8, 6))
data[feature_col].plot(kind='kde')
data[feature_col].hist(bins=20)