import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy.stats as sci
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import statsmodels.formula.api as smf
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
#copyright by Interstellar-Ark-AI on 2020_5_23
# Load the training set and take a first look at its contents, its shape and
# the distribution of the target column `price`.
data = pd.read_csv("house_train.csv")
# In a script a bare expression is evaluated and thrown away, so the preview
# and the shape must be printed explicitly to become visible.
print(data.head())  # first rows of the data set
print(data.shape)  # (n_rows, n_columns)
print(data['price'].describe())  # summary statistics of the target; helps to spot outliers
# Distribution plot of the target (histogram with density estimate, not a
# scatter plot).
# NOTE(review): seaborn deprecated `distplot` in 0.11; switch to
# `histplot(..., kde=True)` / `displot` once the minimum seaborn version allows.
sns.distplot(data['price'])
plt.show()
# Data cleaning: remove outliers from the price-vs-area relationship.
area_price_plot = dict(x='area', y='price', data=data)
sns.jointplot(**area_price_plot)  # joint distribution before cleaning
plt.show()
# Rows with area > 100 but price < 700 are treated as noise and removed.
area_too_large = data['area'] > 100
price_too_low_for_area = data['price'] < 700
index_del = data.loc[area_too_large & price_too_low_for_area].index
data.drop(labels=index_del, axis='index', inplace=True)
# `data` was modified in place, so the same kwargs now plot the cleaned frame.
sns.jointplot(**area_price_plot)
plt.show()
# Data cleaning: remove outliers from the price-vs-PM2.5 relationship.
pm25_price_plot = dict(x='pm25', y='price', data=data)
sns.jointplot(**pm25_price_plot)  # joint distribution before cleaning
plt.show()
# Rows with pm25 > 80 but price < 400 are treated as noise and removed.
pm25_too_high = data['pm25'] > 80
price_too_low_for_pm25 = data['price'] < 400
index_del = data.loc[pm25_too_high & price_too_low_for_pm25].index
data.drop(labels=index_del, axis='index', inplace=True)
# `data` was modified in place, so the same kwargs now plot the cleaned frame.
sns.jointplot(**pm25_price_plot)
plt.show()
# Data cleaning: remove outliers from the price-vs-crime-rate relationship.
crime_price_plot = dict(x='crime_rate', y='price', data=data)
sns.jointplot(**crime_price_plot)  # joint distribution before cleaning
plt.show()
# Rows with crime_rate > 7.5 but price < 500 are treated as noise and removed.
crime_too_high = data['crime_rate'] > 7.5
price_too_low_for_crime = data['price'] < 500
index_del = data.loc[crime_too_high & price_too_low_for_crime].index
data.drop(labels=index_del, axis='index', inplace=True)
# `data` was modified in place, so the same kwargs now plot the cleaned frame.
sns.jointplot(**crime_price_plot)
plt.show()
# Data cleaning: report which columns contain missing values, then drop the
# id column and the rows that lack a crime rate or a green rate.
null_counts = data.isnull().sum()  # missing values per column
total = null_counts.sort_values(ascending=False)
percent = (null_counts / len(data)).sort_values(ascending=False)  # share of rows missing
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# `distirct` (spelled this way throughout the script) is used as a categorical
# variable, so it is cast to str. This is done after the null statistics above
# so that the cast cannot influence them.
data['distirct'] = data['distirct'].astype(str)
print(missing_data.head(13))  # columns with the most missing values
data1 = data.drop(columns=['id'])  # the id column is not used as a feature
crime_rate_missing = data1['crime_rate'].isnull()
data2 = data1.drop(index=data1.loc[crime_rate_missing].index)  # rows without a crime rate
green_rate_missing = data2['green_rate'].isnull()
data3 = data2.drop(index=data2.loc[green_rate_missing].index)  # rows without a green rate
print(data3.isnull().sum().max())  # largest per-column count of remaining missing values
# Visualise the pairwise correlations between the columns as a heat map.
plt.rcParams['figure.figsize'] = (15, 10)
# Compute the correlation matrix on numeric/bool columns only. `distirct` was
# cast to str earlier, and since pandas 2.0 `DataFrame.corr()` raises on
# non-numeric columns instead of silently skipping them. Selecting the columns
# explicitly gives the same matrix on old and new pandas versions.
corrmatrix = data3.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corrmatrix, square=True, vmax=1, vmin=-1, center=0.0, cmap='coolwarm')
plt.show()
# Feature engineering: Pearson correlation coefficients among the columns that
# correlate most strongly with `price`.
k = 10  # number of columns kept, ranked by their correlation with 'price'
cols = corrmatrix.nlargest(k, 'price').index
# np.corrcoef treats each row as one variable, hence the transpose.
cm = np.corrcoef(data3[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cmap='RdPu',
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()
# Model 1: ordinary least squares regression fitted on a training split.
feature_data = data3.drop(['price'], axis=1)
target_data = data3['price']  # prediction target
# Single hold-out split (70% train / 30% test), not cross-validation.
# random_state is fixed so that the split, and therefore the fitted
# coefficients, are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    feature_data, target_data, test_size=0.3, random_state=42
)
# The formula API reads the target and the features from one frame.
df_train = pd.concat([X_train, y_train], axis=1)
# NOTE(review): this summarises `data` (before the id column and the rows with
# missing values were dropped), not the training frame - confirm that is intended.
print(data['distirct'].describe())
# C(...) marks floor, oriented and distirct as categorical terms in the formula.
lr_model = ols("price~area+C(floor)+C(oriented)+crime_rate+pm25+C(distirct)", data=df_train).fit()