数据集下载:易一网络科技 - 付费文章(www.intumu.com)
加载数据
import pandas as pd

# Read the housing dataset from the CSV file in the working directory.
# The original note lists six districts (Shijingshan, Fengtai, Chaoyang,
# Haidian, Dongcheng, Xicheng) -- presumably the values found in the `dist`
# column; confirm against the data.
df = pd.read_csv("MYUNOI.csv")

# Preview the first five rows (only rendered when run in a notebook).
df.head(n=5)
       dist  roomnum  halls    AREA   floor  subway  school  price
0  chaoyang        1      0   46.06  middle       1       0  48850
1  chaoyang        1      1   59.09  middle       1       0  46540
2   haidian        5      2  278.95    high       1       1  71662
3   haidian        3      2  207.00    high       1       1  57972
4   fengtai        2      1   53.32     low       1       1  71268
import warnings

# Silence every warning for the rest of the session so library deprecation
# notices do not clutter the output.
warnings.filterwarnings(action="ignore")
是否有空值
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()
dist False
roomnum False
halls False
AREA False
floor False
subway False
school False
price False
dtype: bool
数据分割
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

# Features are every column except the first and the last; the target is the
# last column.
# NOTE(review): the slice starts at 1, so column 0 (`dist` in the preview
# output) is left out of the features -- confirm that this is intentional.
X0 = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

# Encode the non-numeric columns as integer codes.
# NOTE(review): the encoder is fitted on the whole feature frame, so numeric
# columns are replaced by category indices as well -- confirm this is wanted.
X = OrdinalEncoder().fit_transform(X0)

# Alternatives that were tried and left disabled:
# X = OneHotEncoder().fit_transform(X) #http://localhost:8888/notebooks/20190819yellowbrick/08-Learning
# y = LabelEncoder().fit_transform(y0)
特征分析
import numpy as np
from sklearn.linear_model import LassoCV
from yellowbrick.features.importances import FeatureImportances
# NOTE(review): newer yellowbrick releases appear to expose FeatureImportances
# from `yellowbrick.model_selection` -- confirm against the installed version.

# Plot labels come from the un-encoded frame so they line up with the columns
# of the encoded matrix X.
features = X0.columns.tolist()

# Candidate regularisation strengths for the cross-validated Lasso. Defined
# (and LassoCV imported) here, using the same grid as the alpha-selection
# step, so this step does not fail with NameError when the script is executed
# top to bottom.
alphas = np.logspace(-10, 1, 400)

# Fit a 5-fold cross-validated Lasso and plot the relative importance of each
# feature.
model = LassoCV(alphas=alphas, cv=5)
viz = FeatureImportances(model, labels=features)
viz.fit(X, y)
viz.poof()
注:学区(school)和地铁(subway)的特征重要性最高,这在意料之中!
Alpha参数选取
import numpy as np
from sklearn.linear_model import LassoCV
from yellowbrick.regressor import AlphaSelection

# 400 candidate alphas, log-spaced between 1e-10 and 1e1.
alphas = np.logspace(start=-10, stop=1, num=400)

# Visualise and validate: fit a 5-fold cross-validated Lasso over the grid
# and plot how the error varies with alpha.
model = LassoCV(cv=5, alphas=alphas)
visualizer = AlphaSelection(model)
visualizer.fit(X, y)
visualizer.poof()
使用Alpha参数评估预测结果
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from yellowbrick.regressor import PredictionError

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# NOTE(review): alpha=0.4381 is presumably the value read off the alpha
# selection plot; that search was run with Lasso while this model is Ridge --
# confirm the value is meant to carry over.
ridge_model = Ridge(alpha=0.4381)

# Fit on the training split, score on the held-out split, then draw the
# predicted-vs-actual plot.
visualizer = PredictionError(ridge_model)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()
学习曲线
from sklearn.linear_model import RidgeCV
from yellowbrick.model_selection import LearningCurve

# Plot training and cross-validation R^2 as the training set grows, using a
# ridge regressor with built-in cross-validation and default settings.
model = RidgeCV()
visualizer = LearningCurve(
    model,
    scoring='r2',
)
visualizer.fit(X, y)
visualizer.poof()
持久化(略)
新手可查阅历史目录:yeayee:Python数据分析及可视化实例目录(zhuanlan.zhihu.com)