# 1、读取数据 (1. Load the data)
# Read the Boston housing dataset and take a first-pass look at it.
import pandas as pd

data = pd.read_csv("boston_housing.csv")

# Preview the first five rows
data.head()
# Column dtypes and non-null counts
data.info()
# Missing-value count per column
data.isna().sum()
# Summary statistics for every attribute
data.describe()
# 2、单变量分析 (2. Univariate analysis)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
plt.figure()
sns.distplot(data.MEDV.values, bins=30, kde=True)
plt.xlabel('Median value of owner-occupied homes', fontsize=12)
plt.show()
# 单个特征散点图
plt.scatter(range(data.shape[0]), data["MEDV"].values,color='purple')
plt.title("Distribution of Price");
# 删除y大于50的样本
data = data[data.MEDV < 50]
# 输出数组的行列数 0:行数 1:列数
data.shape
#使用条形显示每个分类箱中的观察计数
sns.countplot(data.CHAS);
plt.xlabel('Charles River');
plt.ylabel('Number of occurrences');
# 3、两两特征之间的相关性 (3. Pairwise feature correlations)
# Pearson correlation for all feature pairs; |r| >= 0.5 is treated as strong.
data_corr = data.corr().abs()

# Full annotated heatmap of absolute correlations.
plt.subplots(figsize=(13, 9))
sns.heatmap(data_corr, annot=True)
# Overlay a second heatmap masking every cell below 1.0, so only the
# diagonal (self-correlation) cells remain visible on top.
sns.heatmap(data_corr, mask=data_corr < 1, cbar=False)
plt.savefig('house_coor.png')
plt.show()

# Column names, used to map index pairs back to feature names.
cols = data.columns
# Report only pairs whose absolute correlation meets this threshold.
threshold = 0.5
# Each entry: [correlation, row index, column index]
corr_list = []
size = data_corr.shape[0]

# Scan the upper triangle (j > i) so each unordered pair is visited once.
# data_corr holds absolute values, so a single `>= threshold` test suffices —
# the original negative-correlation clause could never fire on abs values.
# Values equal to 1 are excluded to skip the diagonal.
for i in range(size):
    for j in range(i + 1, size):
        r = data_corr.iloc[i, j]
        if threshold <= r < 1:
            corr_list.append([r, i, j])

# Strongest correlations first.
s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))

# Print each highly correlated pair by feature name.
for v, i, j in s_corr_list:
    print("%s and %s = %.2f" % (cols[i], cols[j], v))

# Scatter plot for each highly correlated pair.
# `size=` was renamed to `height=` in seaborn 0.9 (same figure height in inches).
for v, i, j in s_corr_list:
    sns.pairplot(data, height=6, x_vars=cols[i], y_vars=cols[j])
    plt.show()