#Kaggle中的竞赛是练习数据分析的好地方。最近我也尝试着入门Kaggle数据分析,并把自己的想法写出来。目前排名不好,大家有想法的可以一起讨论哈!
#导入各种模块,用的是AdaBoost增强决策树回归算法和梯度提升回归算法
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
# Read the training and test CSV files into DataFrames.
train_file_path = r'E:/study/python/keggle/House/train.csv'
test_file_path = r'E:/study/python/keggle/House/test.csv'
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)
# Data cleaning: discard the columns the author identified as having a high
# proportion of missing values (the 'Id' column is removed in the same call).
columns_to_drop = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']
train_clean_df = train_df.drop(columns_to_drop, axis=1)
#用平均值填充LotFrontage的缺失值
LotFrontage_mean = trai