# 数据挖掘EDA学习 (Data Mining EDA study notes)
"""
EDA-数据探索性分析
1.EDA的价值主要在于熟悉数据集,了解数据集,对数据集进行验证来确定所获得数据集可以用于
接下来的机器学习或者深度学习使用。
2.当了解了数据集之后我们下一步就是要去了解变量间的相互关系以及变量与预测值之间的存在关系。
3.引导数据科学从业者进行数据处理以及特征工程的步骤,使数据集的结构和特征集让接下来的预测
问题更加可靠。
4.完成对于数据的探索性分析,并对于数据进行一些图表或者文字总结并打卡
"""
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
# ---------------------------------------------------------------
# Load the used-car data set and take a first look at it:
# head/tail preview, summary statistics, and missing-value counts.
# ---------------------------------------------------------------
path = './data/'
# The files are space-separated, not comma-separated.
Train_data = pd.read_csv(path + 'used_car_train.csv', sep=' ')
Test_data = pd.read_csv(path + 'used_car_testA.csv', sep=' ')

# Preview first and last rows together.
# FIX: DataFrame.append() was removed in pandas 2.0 -> use pd.concat.
# The original expression was also a no-op in a script, so print it.
print(pd.concat([Train_data.head(), Train_data.tail()]))

descibe = Train_data.describe()   # summary stats (misspelled name kept for compatibility)
info = Train_data.info()          # info() prints to stdout and returns None

# Count missing values per column and bar-plot only the columns that have any.
missing = Train_data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()

# Visualise the missing-value pattern on random samples of the rows.
msno.matrix(Train_data.sample(1000))
msno.bar(Train_data.sample(100000))
# ---------------------------------------------------------------
# Clean pseudo-categorical columns.
# 'notRepairedDamage' uses '-' as a missing-value marker: convert it
# to a real NaN in BOTH the train and the test set.
# ---------------------------------------------------------------
print(Train_data['notRepairedDamage'].value_counts())
Train_data['notRepairedDamage'].replace('-', np.nan, inplace=True)
print(Test_data['notRepairedDamage'].value_counts())
# BUG FIX: the original replaced on Train_data a second time here,
# so the test set was never cleaned.
Test_data['notRepairedDamage'].replace('-', np.nan, inplace=True)

# 'seller' and 'offerType' are almost constant (one value dominates),
# so they carry no signal -> drop them from both sets.
print(Train_data['seller'].value_counts())
print(Test_data['offerType'].value_counts())
del Train_data["seller"]
del Train_data["offerType"]
del Test_data["seller"]
del Test_data["offerType"]
# ---------------------------------------------------------------
# Examine the distribution of the target variable 'price'.
# ---------------------------------------------------------------
print(Train_data['price'].value_counts())

import scipy.stats as st

y = Train_data['price']
# Fit three candidate distributions; Johnson SU usually fits a
# heavy-tailed, strictly-positive target such as price best.
# NOTE: sns.distplot is deprecated since seaborn 0.11 (its 'fit='
# option has no one-line replacement, so it is kept here).
plt.figure(1); plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2); plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)

sns.distplot(Train_data['price'])

# Skewness / kurtosis of the target.
print("Skewness: %f" % Train_data['price'].skew())
print("Kurtosis: %f" % Train_data['price'].kurt())

# Skewness / kurtosis of every column.
# FIX: pass numeric_only=True -- pandas >= 2.0 raises a TypeError on
# object columns (e.g. 'notRepairedDamage') otherwise.
skew_all = Train_data.skew(numeric_only=True)
kurt_all = Train_data.kurt(numeric_only=True)
sns.distplot(skew_all, color='blue', axlabel='Skewness')
sns.distplot(kurt_all, color='orange', axlabel='Kurtness', kde=False)

# A coarse histogram (8 bins) makes the long right tail obvious.
sns.distplot(Train_data['price'], color='red', kde=False, bins=8)
# ---------------------------------------------------------------
# Categorical features: distinct-value count and frequency table
# for each. 'name' and 'regionCode' are high-cardinality, so their
# tables are long.
# ---------------------------------------------------------------
Y_train = Train_data['price']

categorical_features = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox',
                        'notRepairedDamage', 'regionCode', ]
for cat_fea in categorical_features:
    print(cat_fea + "的特征分布如下:")
    # FIX: the original message "{}特征有个{}不同的值" had the measure
    # word 个 in the wrong position.
    print("{}特征有{}个不同的值".format(cat_fea, Train_data[cat_fea].nunique()))
    print(Train_data[cat_fea].value_counts())
# ---------------------------------------------------------------
# Numeric features: correlation with price, per-feature skewness /
# kurtosis, and a distribution plot per feature.
# ---------------------------------------------------------------
numeric_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5',
                    'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
numeric_features.append('price')

price_numeric = Train_data[numeric_features]
correlation = price_numeric.corr()
print(correlation['price'].sort_values(ascending=False), '\n')

# FIX: plt.subplots() returns a (figure, axes) tuple -- the original
# bound the whole tuple to a name called 'ax1'. Unpack it properly.
fig, ax1 = plt.subplots(figsize=(5, 5))
plt.title('Correlation of Numeric Features with Price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8, annot=False)

del price_numeric['price']

# Skewness / kurtosis table, one line per numeric feature.
for col in numeric_features:
    print('{:15}'.format(col),
          'Skewness: {:05.2f}'.format(Train_data[col].skew()),
          ' ',
          'Kurtosis: {:06.2f}'.format(Train_data[col].kurt())
          )

# One distribution plot per numeric feature, three per row.
f = pd.melt(Train_data, id_vars=[], value_vars=numeric_features)
g = sns.FacetGrid(f, col="variable", col_wrap=3, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
# ---------------------------------------------------------------
# Pairwise relations between price and the most correlated features,
# plus regression plots for the two strongest (v_12, v_8).
# ---------------------------------------------------------------
sns.set()
columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
# FIX: pairplot's 'size' keyword was renamed to 'height' and removed
# entirely in seaborn 0.11 -- 'size=2' raises a TypeError there.
sns.pairplot(Train_data[columns], height=2, kind='scatter', diag_kind='kde')
plt.show()

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(24, 20))
v_12_scatter_plotData = pd.concat([Y_train, Train_data['v_12']], axis=1)
sns.regplot(x='v_12', y='price', data=v_12_scatter_plotData, scatter=True, fit_reg=True, ax=ax1)
v_8_scatter_plotData = pd.concat([Y_train, Train_data['v_8']], axis=1)
sns.regplot(x='v_8', y='price', data=v_8_scatter_plotData, scatter=True, fit_reg=True, ax=ax2)
# Generate a full HTML profiling report of the training set.
# NOTE(review): pandas_profiling has been renamed to ydata_profiling;
# this import only works with the legacy package installed.
import pandas_profiling
pfr = pandas_profiling.ProfileReport(Train_data)
pfr.to_file("./example.html")