目录:
一. 查看数据
1.1 查看标签
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Load the training set. A forward slash is used as the path separator
# because it resolves on Windows, Linux and macOS alike, whereas a backslash
# is only treated as a separator on Windows.
df_train = pd.read_csv('data/train.csv')
# Report the (rows, columns) shape of the loaded frame.
print(df_train.shape)
# Summary statistics of the SalePrice column (echoed as the cell result).
df_train['SalePrice'].describe()
(1460, 81)
观察一下它的偏度和峰度值
# Print the skewness and the kurtosis of SalePrice, each formatted with %f.
sale_price = df_train['SalePrice']
skewness = sale_price.skew()
print('Skewness: %f' % skewness)
kurtosis = sale_price.kurt()
print('Kurtosis: %f' % kurtosis)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Plot the distribution of SalePrice.
# NOTE(review): distplot is reported as deprecated by recent seaborn
# releases - consider histplot/displot when upgrading; confirm against the
# installed seaborn version.
price_column = df_train['SalePrice']
sns.distplot(price_column)
Skewness: 1.882876
Kurtosis: 6.536282
1.2 特征与标签
# Living area (square feet) plotted against the sale price.
data = df_train[['SalePrice', 'GrLivArea']]
data.plot(kind='scatter', x='GrLivArea', y='SalePrice')
# Basement area (square feet) plotted against the sale price.
data = df_train[['SalePrice', 'TotalBsmtSF']]
data.plot(kind='scatter', x='TotalBsmtSF', y='SalePrice')
# Overall material and finish quality: one SalePrice box per quality level.
data = df_train.loc[:, ['SalePrice', 'OverallQual']]
fig, ax = plt.subplots(figsize=(8, 6))
# Draw explicitly on the axes that was just created (it is also the current one).
sns.boxplot(x='OverallQual', y='SalePrice', data=data, ax=ax)
# One SalePrice box per Neighborhood value.
data = df_train.loc[:, ['Neighborhood', 'SalePrice']]
fig, ax = plt.subplots(figsize=(10, 6))
# Draw explicitly on the axes that was just created (it is also the current one).
sns.boxplot(x='Neighborhood', y='SalePrice', data=data, ax=ax)
# Rotate the x tick labels by 60 degrees; the trailing semicolon keeps the
# notebook from echoing the return value.
plt.xticks(rotation=60);
1.3 相关性
# Pairwise correlation matrix of the features, drawn as a heatmap.
# The frame is narrowed to numeric/bool columns first: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns implicitly and raises
# on string-valued ones. Selecting the columns explicitly gives the same
# matrix on older pandas versions as well.
corrmat = df_train.select_dtypes(include=['number', 'bool']).corr()
plt.subplots(figsize = (12, 9))
# The trailing semicolon keeps the notebook from echoing the Axes object.
sns.heatmap(corrmat, square = True, cmap = 'Greens');
# Labels of the ten corrmat rows with the largest value in the SalePrice column.
top_rows = corrmat.nlargest(10, 'SalePrice')
cols = top_rows['SalePrice'].index
# Correlation coefficients between those columns; rowvar=False treats each
# column of the array as one variable.
cm = np.corrcoef(df_train[cols].values, rowvar=False)
plt.figure(figsize=(8, 6))
# Annotated heatmap with the column names on both axes.
heatmap_options = dict(
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    cmap='Blues',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
sns.heatmap(cm, **heatmap_options)
# Pairwise scatter plots between SalePrice and a hand-picked set of features.
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# 'height' replaces the former 'size' keyword: seaborn renamed it in 0.9 and
# later removed the old name, so passing 'size' fails on current releases.
# The trailing semicolon keeps the notebook from echoing the grid object.
sns.pairplot(df_train[cols], height = 3);
1.4 缺失值
# Per-column missing-value summary: absolute count and share of all rows.
null_counts = df_train.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
# Fraction of rows that are missing per column, sorted descending and
# rounded to three decimals.
null_share = null_counts / len(df_train)
percent = null_share.sort_values(ascending=False).round(3)
# Place both series side by side under the 'Total' and 'Percent' headers.
missing_data = pd.concat(
    [total_missing, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
# Show the first rows, i.e. the columns with the most missing values.
missing_data.head()
二. 数据处理
2.1 去掉离群点
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the training and test sets. Forward slashes are used as the path
# separator because they resolve on Windows, Linux and macOS alike, whereas
# a backslash is only treated as a separator on Windows.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# Report the (rows, columns) shape of the training frame as loaded.
print('The train data size before dropping Id feature is: {}'.format(train.shape))
print('The test data size before dropping Id feature is: {}'.format(test.