python LinearRegression建模 笔记(一)
主要用到的包:
import pandas as pd  # pd.set_option
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
设置结果展示布局
import pandas as pd
pd.set_option("display.max_columns",1000)
pd.set_option("display.width",1000)
pd.set_option("display.max_colwidth",1000)
对特征计数 :离散数据每个元素个数
housing.ocean_proximity.value_counts()
画图了解特征,hist 直方图
import matplotlib.pyplot as plt
data.hist(bins=50,figsize=(20,15)) #直方图(figsize 是 DataFrame.hist 的参数;plt.hist(x,bins) 不接受 figsize)
plt.scatter(x,y,alpha,c,s,cmap=plt.get_cmap("jet"))#散点图
分测试集和训练集
from sklearn.model_selection import train_test_split
x,y=train_test_split(data,test_size=0.3,random_state=42)
线性相关性
corr_matrix=data.corr()
corr_matrix.house_value.sort_values(ascending=False)
非线性相关性,散点图矩阵
法一:
from pandas.plotting import scatter_matrix
scatter_matrix(housing[attributes],figsize=(20,10))
法二:
import seaborn as sns
sns.pairplot(housing[attributes],height=5) #seaborn>=0.9 中 size 参数已更名为 height
删除某一列
data.drop("列名",axis=1)
###axis=0表示跨行,axis=1表示跨列
填充缺失值
1去除含有缺失值的记录
data.dropna(subset=["列名"])
2去除某一缺失值字段
data.drop("列名",axis=1)
3填充缺失值
data.列名.fillna(median)
sklearn.preprocessing中的Imputer类
from sklearn.preprocessing import Imputer
imputer=Imputer(strategy="median")
imputer.fit(data)
文本特征处理:Scikit-learn中的OneHotEncoder
步骤一:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
housing_cat_encoded=encoder.fit_transform(housing_cat)
步骤二:
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
housing_cat_hot=encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
from sklearn.preprocessing import LabelBinarizer
encoder=LabelBinarizer()
housing_cat_1hot=encoder.fit_transform(np.array(housing_cat_encoded))
特征缩放
preprocessing.scale (X,axis=0, with_mean=True, with_std=True, copy=True)
缩放不能针对整个数据集:缩放器只在训练集上 fit,再用同一个缩放器 transform 训练集和测试集,避免测试集信息泄漏
封装这些操作sklearn.pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline=Pipeline([
("imputer",Imputer(strategy="median")),
("attribs_adder",CombinedAttributesAdder()),
("Std_scaler",StandardScaler()),
])
housing_num_str=num_pipeline.fit_transform(housing_num)