from sklearn import preprocessing
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Fit a linear regression of Boston house prices on four hand-picked features
# and print its R^2 score on a held-out test split.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this script only runs on older releases -- confirm the pinned
# version or switch to another data source.
data = load_boston()
x = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split BEFORE fitting anything, so that neither the scaler nor the
# correlation plot below ever sees the test rows. Fitting the scaler on the
# full data set would leak test-set statistics into training.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=15)

# Learn each feature's min/max from the training rows only, then apply that
# same transform to the test rows.
std = preprocessing.MinMaxScaler()
train_scaled = pd.DataFrame(
    std.fit_transform(x_train_raw),
    columns=x_train_raw.columns,
    index=x_train_raw.index,
)
test_scaled = pd.DataFrame(
    std.transform(x_test_raw),
    columns=x_test_raw.columns,
    index=x_test_raw.index,
)

# Correlation of every training feature with the price; the absolute value is
# taken because a strong negative correlation is as informative as a positive
# one when picking features.
df = train_scaled.copy()
df['price'] = y_train
corr = df.corr()['price'].abs()
img = corr.plot(kind='bar')
plt.show()

# Features picked by hand from the correlation plot as the most relevant ones.
selected_features = ['INDUS', 'RM', 'PTRATIO', 'LSTAT']
x_train = train_scaled[selected_features]
x_test = test_scaled[selected_features]

# Another regressor (e.g. the imported DecisionTreeRegressor) can be swapped
# in here; it exposes the same fit/score interface.
linear_regressor = LinearRegression()
linear_regressor.fit(x_train, y_train)
score = linear_regressor.score(x_test, y_test)  # R^2 on the held-out rows
print(score)