人工智能的比赛很考验实际动手能力,我觉得与其想着系统学完理论再打比赛,不如直接在比赛中学习。
比赛是kaggle上的house prices。
导入相应的包:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as ppf
import warnings##忽略警告
warnings.filterwarnings('ignore')
%matplotlib inline # 图直接在页面显示,而不是弹出一个窗口
plt.style.use('ggplot') # 使用自带的样式进行美化
%matplotlib inline是jupyter notebook里的魔法命令(不是Python语法), 意思是将那些用matplotlib绘制的图显示在页面里而不是弹出一个窗口。在PyCharm等普通Python脚本环境里运行时这句话会报语法错误,需要删掉或注释掉。
导入做特征工程的包:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder  # label encoding
from sklearn.preprocessing import RobustScaler, StandardScaler  # outlier-robust scaling and standardization
from sklearn.pipeline import Pipeline, make_pipeline  # pipeline construction
from scipy.stats import skew  # skewness

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement is `sklearn.impute.SimpleImputer`. Bind it to the old name so
# code written against `Imputer` keeps importing on current releases.
# NOTE(review): SimpleImputer has no `axis` argument — any call that passes
# `axis=` must be adjusted.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # older scikit-learn without sklearn.impute.SimpleImputer
    from sklearn.preprocessing import Imputer
数据探索性分析(在此之前需要先用pd.read_csv把训练集和测试集分别读入train和test这两个DataFrame):
# NOTE(review): `train` must already be loaded as a DataFrame before this
# point — the loading step is not part of the visible code.

# Build the pandas-profiling overview report for the training set.
ppf.ProfileReport(train)

# Box plot of SalePrice grouped by YearBuilt, used to spot outliers.
plt.figure(figsize=(10, 8))
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally.
sns.boxplot(x=train.YearBuilt, y=train.SalePrice)
# Scatter plot of GrLivArea against SalePrice; useful for checking whether the
# two columns are roughly linearly related.
plt.figure(figsize=(12, 6))
living_area = train["GrLivArea"]
sale_price = train["SalePrice"]
plt.scatter(x=living_area, y=sale_price)
plt.xlabel("GrLivArea", fontsize=13)
plt.ylabel("SalePrice", fontsize=13)
# Cap the y axis so the bulk of the points is easier to read.
plt.ylim(0, 800000)
# Remove the outliers picked out from the scatter plot: rows with
# GrLivArea above 4000 and SalePrice below 300000. `train` is modified in
# place (boolean-mask indexing selects the rows to drop).
is_outlier = (train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)
outlier_index = train[is_outlier].index
train.drop(outlier_index, inplace=True)

# Stack the training and test sets so both go through the same cleaning steps.
full = pd.concat([train, test], ignore_index=True)

# The Id column is not used as a feature, so drop it from the combined frame.
full.drop("Id", axis=1, inplace=True)

# full.info()  # uncomment to inspect column dtypes and non-null counts
数据清洗——空值的处理(填充,删除,不处理)
# Count the missing values in every column and list the affected columns in
# ascending order of missing count (shown as cell output in a notebook).
miss = full.isnull().sum()
miss[miss > 0].sort_values(ascending=True)

# Columns whose missing entries are replaced with the literal string "None".
# NOTE(review): GarageYrBlt is in this list, so it ends up holding "None"
# alongside its original values — presumably numeric years; confirm.
cols1 = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
for col in cols1:
    # Assign the result back instead of `full[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated in pandas 2.x and does not
    # update `full` when copy-on-write is enabled.
    full[col] = full[col].fillna("None")

# Numeric columns whose missing entries are replaced with 0.
cols = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols:
    full[col] = full[col].fillna(0)

# LotFrontage: fill missing entries with the column mean (computed over the
# combined train + test frame).
full["LotFrontage"] = full["LotFrontage"].fillna(np.mean(full["LotFrontage"]))

# Columns whose missing entries are replaced with the most frequent value.
cols2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType", "Exterior1st", "Exterior2nd"]
for col in cols2:
    # mode() returns a Series of the most common values; take the first one.
    full[col] = full[col].fillna(full[col].mode()[0])
数据预处理——字符型变成数值型
# Turn some features into categorical ones; LabelEncoder and get_dummies are
# the tools suggested for this.

# Cast the mode-filled columns to strings so they are treated as categories.
# NOTE(review): these columns are only cast here, not encoded — presumably
# they are encoded in a later step that is not shown; confirm.
for col in cols2:
    full[col] = full[col].astype(str)

lab = LabelEncoder()

# GarageYrBlt was filled with the string "None", so cast the whole column to
# str before encoding to give the encoder a single, sortable value type.
full["GarageYrBlt"] = full["GarageYrBlt"].astype(str)

# Replace each "None"-filled column with integer labels. The encoder is refit
# per column, so the label mapping is independent for every column. `cols1`
# holds exactly the columns that were previously encoded one line at a time
# (BsmtFinType1 was encoded twice there; the second pass changed nothing).
for col in cols1:
    full[col] = lab.fit_transform(full[col])
# NOTE(review): `pipeline_data` is not defined anywhere in the visible code —
# presumably it is the fully preprocessed version of `full` (train rows first,
# then test rows); confirm and make sure it is built before this point.

# Number of rows in the (outlier-filtered) training set.
n_train = train.shape[0]

# The first n_train rows are the training features, the rest the test features.
X = pipeline_data[:n_train]
test_X = pipeline_data[n_train:]
y = train.SalePrice

# Fit the scaler on the training features only and apply that same fitted
# scaler to the test features. Fitting a second scaler on the test set would
# put train and test on different scales, so the model's learned thresholds
# would not line up with the test data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
test_X_scaled = scaler.transform(test_X)

# Train on log(SalePrice): the log-transformed target is closer to a normal
# distribution.
y_log = np.log(train.SalePrice)
模型构建
from sklearn.tree import DecisionTreeRegressor  # the regression model

# Fit a decision tree with default settings on the scaled training features
# and the log-transformed target. fit() returns the estimator itself, so
# `model1` refers to the same fitted object as `model`.
model = DecisionTreeRegressor()
model1 = model.fit(X_scaled, y_log)

# The model predicts log prices; np.exp undoes the earlier log transform.
log_predictions = model1.predict(test_X_scaled)
predict = np.exp(log_predictions)

# Write the submission file: one row per test Id with its predicted price.
result = pd.DataFrame({"Id": test.Id, "SalePrice": predict})
result.to_csv("submission1.csv", index=False)