task5 二手车交易价格预测
1.基础数据
2.代码实现
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw data: both CSV files use a single space as the field delimiter.
train_data = pd.read_csv(
    'D:/网页下载/used_car_train_20200313/used_car_train_20200313.csv',
    sep=' ',
)
test_data = pd.read_csv(
    'D:/网页下载/used_car_train_20200313/used_car_testB_20200421/used_car_testB_20200421.csv',
    sep=' ',
)
# Preview the first rows of the training set (only rendered in a notebook/REPL).
train_data.head()
空值,异常值检测:
# Count the missing values in each training column (only rendered in a
# notebook/REPL, because the result is not printed or stored).
train_data.isna().sum()
# Print the dtype of every column so that non-numeric (object) columns stand out.
print(train_data.dtypes)
如果存在object类型的列,需要检查其取值并转换为数值类型。
这里发现notRepairedDamage列中存在异常值'-',下面将其替换为0.0,并把整列转换为float。
def _clean_not_repaired_damage(column):
    """Return ``column`` as floats, with '-' placeholders replaced by 0.0.

    Any entry whose text form starts with '-' becomes 0.0; every other entry
    is converted to float.  The index of ``column`` is preserved, so the
    result can be assigned straight back to the frame it came from.
    """
    # Compare on the text form so the check also works when pandas has
    # already parsed the column as numbers instead of strings.
    as_text = column.astype(str)
    is_placeholder = as_text.str.startswith('-')
    return as_text.mask(is_placeholder, '0.0').astype(float)


# Apply the same cleaning to both data sets so the feature stays comparable
# between training and prediction.
train_data['notRepairedDamage'] = _clean_not_repaired_damage(
    train_data['notRepairedDamage'])
test_data['notRepairedDamage'] = _clean_not_repaired_damage(
    test_data['notRepairedDamage'])
空值处理
# Fill the categorical columns with their most frequent value (the mode).
for col in ('fuelType', 'gearbox', 'bodyType'):
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])
    # The test column must be filled from the test column itself; assigning
    # the training column here would overwrite the test data.
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])
# Any remaining missing value becomes 0.0.  fillna returns a new frame, so
# the result has to be assigned back for the fill to take effect.
test_data = test_data.fillna(0.0)
train_data = train_data.fillna(0.0)
再次检查,无空值
# Re-count the missing values per training column after the fill step
# (only rendered in a notebook/REPL).
train_data.isna().sum()
对价格数据进行了解
# Summary statistics of the target column (only rendered in a notebook/REPL).
price_column = train_data['price']
price_column.describe()
# Distribution plot of the price values.
sns.displot(price_column)
plt.show()
注册日期是年月日的形式,这里我改成了年份来表征
# Reduce the registration date to its year by keeping the first four
# characters of its text form.  The test set gets the same transformation:
# 'regDate' is later used as a model feature for both sets, so it has to be
# on the same scale in each of them.
for frame in (train_data, test_data):
    frame['regDate'] = frame['regDate'].astype(str).str[:4].astype(int)
对相关性进行分析并进行排序
# Upper bound on how many variables are kept for the heatmap.
k = 31
# Pairwise correlations between the numeric columns of the training set.
corrmat = train_data.select_dtypes(include='number').corr()
# Labels of the (at most) k variables most correlated with price, highest first.
highcor_var = corrmat.nlargest(k, 'price')['price'].index
highcor_data = train_data.loc[:, highcor_var]
fig, ax = plt.subplots(figsize=(30, 30))
# Draw on the axes created above so that `fig` keeps referring to the figure.
sns.heatmap(highcor_data.corr(), annot=True, square=True, ax=ax)
ax.set_title('the highest variables')
print(highcor_var)
散点图的形式表示
# Apply seaborn's default plot theme.
sns.set()
# Price plus the features chosen for the pairwise scatter-plot grid.
cols = ['price', 'v_12', 'v_8', 'v_0', 'regDate', 'gearbox', 'bodyType',
        'power', 'fuelType', 'v_5', 'model']
# `height` sets the size of each facet; it replaces the former `size`
# keyword, which current seaborn releases no longer accept.
sns.pairplot(train_data[cols], height=2.5)
plt.show()
选择几个相关性较强的因素,观察其规律
# Scatter plot of price against each of two selected features, one figure
# per feature, with the price axis limited to 0..110000.
for feature in ('v_12', 'v_0'):
    data = pd.concat([train_data['price'], train_data[feature]], axis=1)
    data.plot.scatter(x=feature, y='price', ylim=(0, 110000))
    plt.show()
箱线图形式:
# Box plots of price grouped by each listed feature.  Every plot gets its own
# figure and axes; drawing them all on the current axes would overlay the
# plots on top of each other when this runs as a plain script.
# Each entry is (feature name, figure size); None means matplotlib's default.
boxplot_specs = (
    ('fuelType', None),
    ('bodyType', None),
    ('gearbox', None),
    ('regDate', (12, 9)),
)
for feature, figsize in boxplot_specs:
    data = pd.concat([train_data['price'], train_data[feature]], axis=1)
    fig, ax = plt.subplots(figsize=figsize)
    sns.boxplot(x=feature, y='price', data=data, ax=ax)
    # Limit the price axis to 0..110000.
    ax.set_ylim(0, 110000)
    ax.set_title('{} and price relation[box]'.format(feature))
# Display the figures created above.
plt.show()
模型训练和预测
from sklearn.ensemble import GradientBoostingRegressor
# Gradient-boosting regressor using the Huber loss and a fixed random seed.
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)
# Only the features that correlate strongly with price are used for training.
cols = ['v_12', 'v_8', 'v_0', 'regDate', 'gearbox', 'bodyType',
        'power', 'fuelType', 'v_5', 'model', 'notRepairedDamage',
        'v_3', 'v_11', 'v_10', 'v_9', 'v_4', 'kilometer']
test_x = test_data[cols].values
x = train_data[cols].values
y = train_data['price'].values
GBoost.fit(x, y)
# Predict on the test-set feature matrix built above (`test_x`).
y_pred = GBoost.predict(test_x)
生成结果文件
# Wrap the predicted prices in a single-column frame named 'price'.
prediction = pd.DataFrame({'price': y_pred})
# Put the test-set SaleID column next to the predictions.
res = pd.concat((test_data['SaleID'], prediction), axis='columns')
# Inspect the resulting column labels (only rendered in a notebook/REPL).
res.columns
# Write the result file without the row index.
res.to_csv('./prediction.csv', index=False)