# Usually the hardest part of machine learning is choosing a suitable estimator;
# different estimators suit different datasets and problems.
def gen_estimators():
    """Build the list of linear estimators to compare.

    Assumes ``linear_model`` is ``sklearn.linear_model`` (imported elsewhere
    in the file) -- TODO confirm.

    Returns:
        A list of ``(label, estimator, flag)`` tuples, one per model. Every
        flag is ``True``; this function does not define what the flag means.
    """
    estimators = [
        ('Lasso regression', linear_model.Lasso(alpha=0.1), True),
        ('Ridge regression', linear_model.Ridge(alpha=0.1), True),
        # ``linear_model`` has no ``Hinge`` class; the linear model trained
        # with hinge loss is SGDClassifier(loss='hinge').
        ('Hinge regression', linear_model.SGDClassifier(loss='hinge'), True),
        ('Lassolars regression', linear_model.LassoLars(alpha=0.1), True),
        ('OrthogonalMatchingPursuitCV regression',
         linear_model.OrthogonalMatchingPursuitCV(), True),
        ('BayesianRidge regression', linear_model.BayesianRidge(), True),
        ('PassiveAggressiveRegressor regression',
         linear_model.PassiveAggressiveRegressor(), True),
        # The class is named ``HuberRegressor``; ``linear_model.Huber`` does
        # not exist and raised AttributeError.
        ('HuberRegressor regression', linear_model.HuberRegressor(), True),
        ('LogisticRegression regression',
         linear_model.LogisticRegression(), True),
    ]
    return estimators
# Iterate over the estimators and inspect their scores.
def cross_validate():
    """Fit every estimator from ``gen_estimators`` and print its results.

    Reads the module-level ``X`` and ``y``. Each estimator is fitted on the
    training part of a seeded 60/40 split and its label and ``coef_`` are
    printed; then the 5-fold ``cross_val_score`` results computed on the
    full ``X``/``y`` are printed.
    """
    # The split is seeded (random_state=0), so it is identical for every
    # estimator; compute it once instead of once per loop iteration.
    x_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.4, random_state=0)

    for name, clf, _flag in gen_estimators():
        clf.fit(x_train, y_train)
        print(name, '\n', clf.coef_)
        # 'roc_auc' is the registered scorer name; the previous 'roc_aus'
        # was a typo that made cross_val_score raise.
        # NOTE(review): 'roc_auc' expects a binary target -- if y is
        # multiclass, a different metric is needed; confirm the intent.
        scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
        print(scores)
# scikit-learn cross-validation
# Hold-out test: split the data into a training set and a test set.
from sklearn.datasets import load_iris

# ``sklearn.cross_validation`` was removed in scikit-learn 0.20; the same
# helper lives in ``sklearn.model_selection``. Keep the old module as a
# fallback for legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

iris = load_iris()
X = iris.data
y = iris.target
# Fixed random seed: without it the split would differ on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)
print(y_train)
print(y_test)