from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
# A Pipeline chains several data-processing steps so they run in order.
pl_svm = Pipeline(
    steps=[
        ("normal", Normalizer()),  # normalization
        ("PCA", PCA()),  # dimensionality reduction
        ("SVC", SVC()),  # support-vector-machine classifier
    ]
)

# Toy data: six 2-D training points with binary labels, plus four unseen points.
X_train = [[1, 2], [2, 3], [-1, -3], [-5, -6], [6, 6], [-2, -1]]
Y_train = [1, 1, 0, 0, 1, 0]
X_test = [[2, 2], [10, 10], [-10, -10], [-5, -9]]

clf = pl_svm.fit(X_train, Y_train)

# Print the predictions for the training points first, then for the test points.
for samples in (X_train, X_test):
    print(clf.predict(samples))
再看看在机器学习实战/第一部分第二章程序里的应用
导入库并加载数据
# 加载库
import os
import tarfile
import numpy as np
import pandas as pd
from six.moves import urllib
#DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
#HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
# When the data changes regularly, a small script like the one below can fetch it automatically.
# def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
# os.makedirs(housing_path, exist_ok=True)
# tgz_path = os.path.join(housing_path, "housing.tgz")
# urllib.request.urlretrieve(housing_url, tgz_path)
# housing_tgz = tarfile.open(tgz_path)
# housing_tgz.extractall(path=housing_path)
# housing_tgz.close()
# fetch_housing_data()
# 加载数据
def load_housing_data(housing_path=HOUSING_PATH):
    """Load the housing dataset into a DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The ``pandas.DataFrame`` read from ``housing.csv`` inside
        ``housing_path``.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
查看数据并将特征分为数值特征和分类特征
# Inspect the data structure.
housing = load_housing_data()
housing.head() # look at the first five rows
housing_cat = housing[['ocean_proximity']] # the categorical feature, selected as a one-column frame
housing_num = housing.select_dtypes(include=[np.number]) # only the numeric-dtype columns
设计转换器
转换器功能:添加新的特征rooms_per_household、population_per_household和bedrooms_per_room(最后一个可选)
from sklearn.base import BaseEstimator, TransformerMixin
# Look the column positions up by name: safer than hard-coding indices 3, 4, 5, 6.
# These are the columns the derived ratio features are computed from.
_ratio_source_columns = ("total_rooms", "total_bedrooms", "population", "households")
rooms_ix, bedrooms_ix, population_ix, household_ix = map(
    list(housing.columns).index, _ratio_source_columns
)
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features computed from existing columns.

    ``rooms_per_household`` and ``population_per_household`` are always
    appended; ``bedrooms_per_room`` is appended only when
    ``add_bedrooms_per_room`` is true.  Column positions are taken from the
    module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``household_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support NumPy-style 2-D
        indexing.  ``y`` is accepted but not used.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            # np.c_ places the original array and the new columns side by side.
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
# Apply the transformer to the raw values, without the optional bedrooms_per_room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
后面pipeline调用的是FunctionTransformer,但需要前面那段程序的返回值
from sklearn.preprocessing import FunctionTransformer
def add_extra_features(X, add_bedrooms_per_room=True):
    """Append derived ratio columns to the 2-D array ``X``.

    Column positions are taken from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix`` indices.

    Args:
        X: Array indexed as ``X[:, i]``; one row per sample.
        add_bedrooms_per_room: When true, also append the
            bedrooms-to-rooms ratio as the last column.

    Returns:
        ``X`` with rooms-per-household and population-per-household appended
        (followed by bedrooms-per-room when requested).
    """
    rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if add_bedrooms_per_room:
        # Share of rooms that are bedrooms.
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
    return np.c_[X, rooms_per_household, population_per_household]
# Wrap the plain function as a transformer so it can be used as a pipeline step.
attr_adder = FunctionTransformer(
    func=add_extra_features,  # the custom function that appends the extra features
    validate=False,  # input validation switched off
    kw_args={"add_bedrooms_per_room": False},  # extra keyword arguments passed to func
)
使用pipeline处理数值特征
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler #估算器
# Preprocessing pipeline for the numeric features.
num_pipeline = Pipeline(
    steps=[
        # Replace missing values with the column median.
        ("imputer", SimpleImputer(strategy="median")),
        # Append the derived ratio features.
        ("attribs_adder", FunctionTransformer(add_extra_features, validate=False)),
        # Standardize: subtract the mean, then divide by the standard deviation.
        ("std_scaler", StandardScaler()),
    ]
)
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr
使用列转换器ColumnTransformer处理所有特征
from sklearn.compose import ColumnTransformer #列转换器
from sklearn.preprocessing import OneHotEncoder #OneHot编码器
# Column groups: every column of the numeric frame, plus the one categorical column.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer(
    transformers=[
        # Numeric columns go through the numeric preprocessing pipeline.
        ("num", num_pipeline, num_attribs),
        # The categorical column is one-hot encoded (one indicator column per category).
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared