零基础入门数据挖掘 - 二手车交易价格预测
1.载入数据科学及相关可视化库文件
下面展示本文用到的一些 Python 库文件。
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
2.导入数据
# Read the training set and the test-A set; both files are space-separated.
# NOTE(review): `path` is not defined in the visible code; presumably a
# directory prefix that already ends with a path separator -- confirm.
Train_data = pd.read_csv(path + 'used_car_train_20200313.csv', sep=" ")
Test_data = pd.read_csv(path + 'used_car_testA_20200313.csv', sep=" ")

# Report the (rows, columns) shape of each loaded frame.
for label, frame in (
    ("Train data shape:", Train_data),
    ("TestA data shape:", Test_data),
):
    print(label, frame.shape)
3.数据属性观察
# Preview the first and last five rows in one table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and yields the same stacked frame.
pd.concat([Train_data.head(), Train_data.tail()])
4.判断缺失值和异常
# Count missing values per column.
Train_data.isnull().sum()
# Inspect the distinct values of 'notRepairedDamage' and how often each occurs.
Train_data['notRepairedDamage'].value_counts()
# Treat the '-' placeholder as a missing value. The result is assigned back to
# the column instead of calling replace(..., inplace=True) on the selection:
# the chained in-place form operates on an intermediate object and does not
# update Train_data when pandas Copy-on-Write is enabled.
Train_data['notRepairedDamage'] = Train_data['notRepairedDamage'].replace('-', np.nan)
5.了解预测值的分布
6.特征分为类别特征和数字特征
# Print, for every categorical feature, its number of distinct values and the
# frequency of each value.
# NOTE(review): `categorical_features` is not defined in the visible code;
# presumably a list of Train_data column names -- confirm.
for cat_fea in categorical_features:
    print(cat_fea + "的特征分布如下:")
    print("{}特征有{}不同的值".format(cat_fea, Train_data[cat_fea].nunique()))
    print(Train_data[cat_fea].value_counts())
7.数字特征分析
# Print the skewness and kurtosis of every numeric feature, one line per column.
# NOTE(review): `numeric_features` is not defined in the visible code;
# presumably a list of numeric Train_data column names -- confirm.
for col in numeric_features:
    print(
        '{:15}'.format(col),  # column name, padded to 15 characters
        'skewness:{:05.2f}'.format(Train_data[col].skew()),
        ' ',
        'kurtosis:{:06.2f}'.format(Train_data[col].kurt()),
    )
8.类别特征分析
9.生成数据报告
# The block uses pandas_profiling but the visible code never imports it,
# which raises NameError; import it here so the snippet runs on its own.
import pandas_profiling

# Build a profiling report for the training data and write it out as HTML.
pfr = pandas_profiling.ProfileReport(Train_data)
pfr.to_file("./example.html")
后续继续添加