二手车价格预测数据探索

最新推荐文章于 2024-01-27 23:18:19 发布

陈yao11

最新推荐文章于 2024-01-27 23:18:19 发布

阅读量360

点赞数

分类专栏： python 文章标签：数据分析

本文链接：https://blog.csdn.net/weixin_40497069/article/details/105076588

版权

python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

二手车价格预测数据探索

1.赛题理解

【类型】属于回归问题。
【数据字段】
训练数据字段：

字段名字	含义	类型
name	汽车编码	int
regDate	汽车注册时间	int
model	车型编码	float
brand	品牌	int
bodyType	车身类型	float
fuelType	燃油类型	float
gearbox	变速箱	float
power	汽车功率	int
kilometer	汽车行驶公里	float
notRepairedDamage	汽车有没有修复的损坏	object
regionCode	看车地区编码	int
seller	销售方	int
offerType	报价类型	int
creatDate	广告发布时间	int
price	汽车价格	int

tips:还有v0-v14匿名数据【人工构造，匿名特征】
【观察数据】

Train_data.head().append（Train_data,tail())#输出前10行数据
Train_data.shape #输出行列
Train_data.describe() #输出数据的相关统计量:个数，平均值，方差，最小值，中位数25%，50%，75%，最大值
Train_data.info()#输出数据类型

【判断数据缺失或者异常】

#1.查看数据缺失nan
Train_data.isnull().sum()#查看每列存在的nan情况
#可视化看缺省值柱状图,如果很小一般选择填充，多考虑删掉过多、可以考虑删掉
missing = Train_data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()
# 可视化看下缺省值
msno.matrix(Train_data.sample(250))
msno.bar(Train_data.sample(1000))
#2.查看异常值检测
Train_data.info()
Train_data['notRepairedDamage'].value_counts()#此列为object,统计其信息，有1,0和-
Train_data['notRepairedDamage'].replace('-', np.nan, inplace=True)#替换“-”为nan
Train_data['notRepairedDamage'].value_counts()#重新看数据分布

Train_data["seller"].value_counts()#seller分布不均
Train_data["offerType"].value_counts()#同上

del Train_data["seller"]#所以删掉这两类
del Train_data["offerType"]

#3.了解预测值的分布
Train_data['price']
Train_data['price'].value_counts()

## 1) 总体分布概况（无界约翰逊分布等）
import scipy.stats as st
y = Train_data['price']
plt.figure(1); plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2); plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)
#价格不服从正态分布，所以在进行回归之前，它必须进行转换。虽然对数变换做得很好，但最佳拟合是无界约翰逊分布
## 2) 查看skewness and kurtosis
sns.distplot(Train_data['price']);
print("Skewness: %f" % Train_data['price'].skew())
print("Kurtosis: %f" % Train_data['price'].kurt())

Train_data.skew(), Train_data.kurt()
sns.distplot(Train_data.skew(),color='blue',axlabel ='Skewness')
sns.distplot(Train_data.kurt(),color='orange',axlabel ='Kurtness')
#https://www.cnblogs.com/wyy1480/p/10474046.html
#https://www.cnblogs.com/wyy1480/p/10474046.html
## 3) 查看预测值的具体频数
plt.hist(Train_data['price'], orientation = 'vertical',histtype = 'bar', color ='red')
plt.show()

# log变换 z之后的分布较均匀，可以进行log变换进行预测，这也是预测问题常用的trick
plt.hist(np.log(Train_data['price']), orientation = 'vertical',histtype = 'bar', color ='red')
plt.show()

数据的特征分为类别特征和数字特征，查看类别特征的分布

# 分离label即预测值
Y_train = Train_data['price']
# 数字特征
# numeric_features = Train_data.select_dtypes(include=[np.number])
# numeric_features.columns
# # 类型特征
# categorical_features = Train_data.select_dtypes(include=[np.object])
# categorical_features.columns

# 类别特征nunique分布
for cat_fea in categorical_features:
print(cat_fea + "的特征分布如下：")
print("{}特征有个{}不同的值".format(cat_fea, Train_data[cat_fea].nunique()))
print(Train_data[cat_fea].value_counts())

#数字特征分析
numeric_features.append('price')
numeric_features
Train_data.head()

## 1) 相关性分析
price_numeric = Train_data[numeric_features]
correlation = price_numeric.corr()
print(correlation['price'].sort_values(ascending = False),'\n')

#数字特征相关性可视化
f , ax = plt.subplots(figsize = (7, 7))
plt.title('Correlation of Numeric Features with Price',y=1,size=16)
sns.heatmap(correlation,square = True, vmax=0.8)

del price_numeric['price']

## 2) 查看几个特征得 偏度和峰值
for col in numeric_features:
print('{:15}'.format(col),
'Skewness: {:05.2f}'.format(Train_data[col].skew()) ,
' ' ,'Kurtosis: {:06.2f}'.format(Train_data[col].kurt())
)

## 3) 每个数字特征得分布可视化
f = pd.melt(Train_data, value_vars=numeric_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")

## 4) 数字特征相互之间的关系可视化
#多变量之间关系可视化文档https://www.jianshu.com/p/6e18d21a4cad
columns = ['price', 'v_12', 'v_8' , 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
sns.pairplot(Train_data[columns],size = 2 ,kind ='scatter',diag_kind='kde')
plt.show()

Train_data.columns

## 5) 多变量互相回归关系可视化
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(nrows=5, ncols=2,
# ['v_12', 'v_8' , 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
v_12_scatter_plot = pd.concat([Y_train,Train_data['v_12']],axis = 1)
sns.regplot(x='v_12',y = 'price', data = v_12_scatter_plot,scatter= True, fit_reg=True, ax=ax1)
v_8_scatter_plot = pd.concat([Y_train,Train_data['v_8']],axis = 1)
sns.regplot(x='v_8',y = 'price',data = v_8_scatter_plot,scatter= True, fit_reg=True, ax=ax2)
v_0_scatter_plot = pd.concat([Y_train,Train_data['v_0']],axis = 1)
sns.regplot(x='v_0',y = 'price',data = v_0_scatter_plot,scatter= True, fit_reg=True, ax=ax3)
power_scatter_plot = pd.concat([Y_train,Train_data['power']],axis = 1)
sns.regplot(x='power',y = 'price',data = power_scatter_plot,scatter= True, fit_reg=True, ax=ax4)
v_5_scatter_plot = pd.concat([Y_train,Train_data['v_5']],axis = 1)
sns.regplot(x='v_5',y = 'price',data = v_5_scatter_plot,scatter= True, fit_reg=True, ax=ax5)
v_2_scatter_plot = pd.concat([Y_train,Train_data['v_2']],axis = 1)
sns.regplot(x='v_2',y = 'price',data = v_2_scatter_plot,scatter= True, fit_reg=True, ax=ax6)
v_6_scatter_plot = pd.concat([Y_train,Train_data['v_6']],axis = 1)
sns.regplot(x='v_6',y = 'price',data = v_6_scatter_plot,scatter= True, fit_reg=True, ax=ax7)
v_1_scatter_plot = pd.concat([Y_train,Train_data['v_1']],axis = 1)
sns.regplot(x='v_1',y = 'price',data = v_1_scatter_plot,scatter= True, fit_reg=True, ax=ax8)
v_14_scatter_plot = pd.concat([Y_train,Train_data['v_14']],axis = 1)
sns.regplot(x='v_14',y = 'price',data = v_14_scatter_plot,scatter= True, fit_reg=True, ax=ax9)
v_13_scatter_plot = pd.concat([Y_train,Train_data['v_13']],axis = 1)
sns.regplot(x='v_13',y = 'price',data = v_13_scatter_plot,scatter= True, fit_reg=True, ax=ax10

类别特征可视化

## 1) unique分布
for fea in categorical_features:
print(Train_data[fea].nunique())

categorical_features
# 因为 name和 regionCode的类别太稀疏了，这里我们把不稀疏的几类画一下
categorical_features = ['model',
'brand',
'bodyType',
'fuelType',
'gearbox',
'notRepairedDamage']
for c in categorical_features:
Train_data[c] = Train_data[c].astype('category')
if Train_data[c].isnull().any():
Train_data[c] = Train_data[c].cat.add_categories(['MISSING'])
Train_data[c] = Train_data[c].fillna('MISSING')
def boxplot(x, y, **kwargs):
sns.boxplot(x=x, y=y)
x=plt.xticks(rotation=90)
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(boxplot, "value", "price")

## 3) 类别特征的小提琴图可视化
catg_list = categorical_features
target = 'price'
for catg in catg_list :
sns.violinplot(x=catg, y=target, data=Train_data)
plt.show()

categorical_features = ['model',
'brand',
'bodyType',
'fuelType',
'gearbox',
'notRepairedDamage']

## 4) 类别特征的柱形图可视化
def bar_plot(x, y, **kwargs):
sns.barplot(x=x, y=y)
x=plt.xticks(rotation=90)
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(bar_plot, "value", "price")

## 5) 类别特征的每个类别频数可视化(count_plot)
def count_plot(x, **kwargs):
sns.countplot(x=x)
x=plt.xticks(rotation=90)
f = pd.melt(Train_data, value_vars=categorical_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(count_plot, "value")

2.总结

数据探索在机器学习中我们一般称为EDA（Exploratory Data Analysis）：是指对已有的数据（特别是调查或观察得来的原始数据）在尽量少的先验假定下进行探索，通过作图、制表、方程拟合、计算特征量等手段探索数据的结构和规律的一种数据分析方法。

对于数据的初步分析（直接查看数据，或.sum(), .mean()，.descirbe()等统计函数）可以从：样本数量，训
练集数量，是否有时间特征，特征所表示的含义（非匿名特征），特征类型（字符类似，int，float，time），特征的缺失情况（注意缺失的在数据中的表现形式，有些是空的有些是”NAN”符号等），特征的均值方差情况。
分析记录某些特征值缺失占比30%以上样本的缺失处理，有助于后续的模型验证和调节，分析特征应该是填充（填充方式是什么，均值填充，0填充，众数填充等），还是舍去，还是先做样本分类用不同的特征模型去预测。
对于异常值做专门的分析，分析特征异常的label是否为异常值（或者偏离均值较远或者事特殊符号）,异常值
是否应该剔除，还是用正常值填充，是记录异常，还是机器本身异常等。
对于Label做专门的分析，分析标签的分布情况等。
进步分析可以通过对特征作图，特征和label联合做图（统计图，离散图），直观了解特征的分布情况，通过这一步也可以发现数据之中的一些异常值等，通过箱型图分析一些特征值的偏离情况，对于特征和特征联合作图，对于特征和label联合作图，分析其中的一些关联性。

陈yao11

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
二手车价格预测数据探索

二手车价格预测数据探索1.赛题理解【类型】属于回归问题。【数据字段】训练数据字段：字段名字含义类型name汽车编码intregDate汽车注册时间intmodel车型编码floatbrand品牌intbodyType车身类型floatfuelType燃油类型floatgearbox变速箱float...
复制链接

扫一扫