# Kaggle项目——House Prices缺失值填充

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
import warnings
import seaborn as sns
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score
from operator import itemgetter
import itertools
import xgboost as xgb
#忽略警告
warnings.filterwarnings("ignore")
#最小二乘法拟合中的系数矩阵的秩是不足的。只有在满 = False时，才会发出警告。警告可以通过以下语句关闭
warnings.simplefilter('ignore', np.RankWarning)
#将训练数据和测试数据合并，并查看数据特征
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'], test.loc[:,'MSSubClass':'SaleCondition']), ignore_index=True)

## 缺失值查看

#查看缺失值情况
total = all_data.isnull().sum().sort_values(ascending=False)
percent=(all_data.isnull().sum()/len(all_data)).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['total', 'percent'])
missing_data

## 地下室和车库相关数据填充

#先对所有的地下室相关进行个别填充
a = pd.Series(all_data.columns)
BsmtList = a[a.str.contains('Bsmt')].values
# BsmtList:array(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
#        'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
#        'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'], dtype=object)
condition1 = (all_data['BsmtExposure'].isnull()) & (all_data['BsmtCond'].notnull())#有3个例子
all_data.loc[condition1,'BsmtExposure']=all_data['BsmtExposure'].mode()[0]
condition2 = (all_data['BsmtCond'].isnull()) & (all_data['BsmtQual'].notnull()) # 3个
all_data.loc[condition2,'BsmtCond']=all_data.loc[condition2,'BsmtQual']
condition3 = (all_data['BsmtQual'].isnull()) & (all_data['BsmtExposure'].notnull()) # 2个
all_data.loc[condition3,'BsmtQual'] = all_data.loc[condition3,'BsmtCond']
condition4 = (all_data['BsmtFinType2'].isnull()) & (all_data['BsmtFinSF2']!=0)&(all_data['BsmtFinSF2'].notnull()) #1个
all_data.loc[condition4,'BsmtFinType2'] = all_data['BsmtFinType2'].mode()[0]
GarageList = a[a.str.contains('Garage')].values
condition5=(all_data['GarageYrBlt'].isnull())&(all_data['GarageType'].notnull())#2个
all_data.loc[condition5,'GarageFinish']=all_data[all_data['GarageType']=='Detchd']['GarageFinish'].mode()[0]
all_data.loc[condition5,'GarageQual']=all_data[all_data['GarageType']=='Detchd']['GarageQual'].mode()[0]
all_data.loc[condition5,'GarageCond']=all_data[all_data['GarageType']=='Detchd']['GarageCond'].mode()[0]
condition6=(all_data['GarageCars'].isnull())&(all_data['GarageType'].notnull())#1个
all_data.loc[condition6,'GarageCars']=all_data[all_data['GarageType']=='Detchd']['GarageCars'].mode()[0]
all_data.loc[condition6,'GarageArea']=all_data[all_data['GarageType']=='Detchd']['GarageArea'].median()
#对于剩下的还未填充的，所有类别型填充None，所有数值型填充0，其中由于车库建造年份数据类型特殊，此处暂不填充
#在Garage列表里去除GarageYrBlt
GarageList=['GarageType',  'GarageFinish', 'GarageCars',
'GarageArea', 'GarageQual', 'GarageCond']
Bsmt_cat=all_data[BsmtList].select_dtypes(exclude=[np.number]).columns
Bsmt_num=all_data[BsmtList].select_dtypes(include=[np.number]).columns
Garage_cat=all_data[GarageList].select_dtypes(exclude=[np.number]).columns
Garage_num=all_data[GarageList].select_dtypes(include=[np.number]).columns
all_data[Bsmt_cat]=all_data[Bsmt_cat].fillna('NoBsmt')
all_data[Bsmt_num]=all_data[Bsmt_num].fillna(0)
all_data[Garage_cat]=all_data[Garage_cat].fillna('NoGarage')
all_data[Garage_num]=all_data[Garage_num].fillna(0)

## POOL相关据填充

#对于有泳池面积而没有泳池质量的利用对应的泳池面积进行填充
poolqcna = all_data[(all_data['PoolQC'].isnull())& (all_data['PoolArea']!=0)][['PoolQC','PoolArea']]
areamean = all_data.groupby('PoolQC')['PoolArea'].mean()
for i in poolqcna.index:
v = all_data.loc[i,['PoolArea']].values
all_data.loc[i,['PoolQC']] = np.abs(v-areamean).astype('float64').argmin()#此处必须加上.astype，否则出错，但3.6版本不会出错
#其他的直接填充没有泳池，面积填充0
all_data['PoolQC'] = all_data["PoolQC"].fillna("None")
all_data['PoolArea'] = all_data["PoolArea"].fillna(0)

## MasVnr相关数据填充

MasVnrM = all_data.groupby('MasVnrType')['MasVnrArea'].median()
mtypena = all_data[(all_data['MasVnrType'].isnull())& (all_data['MasVnrArea'].notnull())][['MasVnrType','MasVnrArea']]
for i in mtypena.index:
v = all_data.loc[i,['MasVnrArea']].values
all_data.loc[i,['MasVnrType']] = np.abs(v-MasVnrM).astype('float64').argmin()

all_data['MasVnrType'] = all_data["MasVnrType"].fillna("None")
all_data['MasVnrArea'] = all_data["MasVnrArea"].fillna(0)

## LotFrontage数据填充

x = all_data.loc[np.logical_not(all_data["LotFrontage"].isnull()), "LotArea"]
y = all_data.loc[np.logical_not(all_data["LotFrontage"].isnull()), "LotFrontage"]
t = (x <= 25000) & (y <= 150)
#拟合多项式以填充缺失值
p = np.polyfit(x[t], y[t], 1)
#根据多项式求函数值
all_data.loc[all_data['LotFrontage'].isnull(), 'LotFrontage'] = np.polyval(p, all_data.loc[all_data['LotFrontage'].isnull(), 'LotArea'])

## 其他缺失值填充

#将缺失值的其他类别型变量填充
all_data = all_data.fillna({
'Alley' : 'NoAlley',
'FireplaceQu': 'NoFireplace',
'Fence' : 'NoFence',
'MiscFeature' : 'None',
'MSSubClass':'None'
})

all_data["MSZoning"] = all_data.groupby("MSSubClass")["MSZoning"].transform(lambda x: x.fillna(x.mode()[0]))
all_data['Electrical']=all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual']=all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data['Exterior1st']=all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd']=all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['SaleType']=all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['Utilities']=all_data['Utilities'].fillna(all_data['Utilities'].mode()[0])
# Functional : data description says NA means typical
all_data["Functional"] = all_data["Functional"].fillna("Typ")