# 本文参考了七月在线的课程内容;所有数据均从 Kaggle 网站获取。
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
from pylab import mpl
# Set matplotlib's sans-serif font list to SimHei (the original note says
# "specify the default font"). Presumably this is so the Chinese plot labels
# used later in the script render correctly -- confirm on the target machine.
mpl.rcParams['font.sans-serif'] = ['SimHei']
from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
# Load the data; the first CSV column is used as the row index of each frame.
train_df = pd.read_csv(
    'G:/KNNtest/HousePrice/train.csv',
    index_col=0,
)
test_df = pd.read_csv(
    'G:/KNNtest/HousePrice/test.csv',
    index_col=0,
)

# 1. Because the Kaggle project supplies both train_df and test_df up front,
#    the two can be merged and preprocessed together. The label column of
#    train_df (SalePrice) has to be taken out first.

# Side-by-side view of the raw label and its log1p transform, i.e. log(x + 1),
# which the original author applies to smooth the label.
prices = pd.DataFrame({"price": train_df["SalePrice"]})
prices["log(price + 1)"] = np.log1p(prices["price"])
# plt.ylabel(u"数量")
# plt.tit