Sklearn Machine Learning Model Basics
Scikit-learn (Sklearn for short) is one of the most popular machine learning libraries in Python, providing a rich set of tools for data mining and data analysis. Below are implementation examples for several basic models.
Linear Regression Model
Linear regression is used to predict continuous values. Here is a simple linear regression example:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
# Generate sample data
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([1, 3, 2, 3, 5])
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
predictions = model.predict(X_test)
print(predictions)
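Beyond the raw predictions, the fitted slope and intercept can be inspected directly; a minimal sketch, continuing from the model trained above:
# Inspect the learned parameters (continuing from the example above)
print(model.coef_)       # one slope per input feature
print(model.intercept_)  # intercept term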
Logistic Regression Model
Logistic regression is used for classification problems. Here is an example of binary classification:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
# Load the data
data = load_iris()
X = data.data[:100]  # keep only the first two classes
y = data.target[:100]
# Create and train the model
model = LogisticRegression()
model.fit(X, y)
# Predict
print(model.predict([[5.1, 3.5, 1.4, 0.2]]))
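Logistic regression also outputs class probabilities, which is often what makes it useful for classification; a minimal sketch, continuing from the model trained above:
# Probability of each class for the same sample (continuing from the example above)
print(model.predict_proba([[5.1, 3.5, 1.4, 0.2]]))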
Decision Trees and Random Forests
Decision trees and random forests are powerful tools for classification and regression.
Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
# Load the data
data = load_breast_cancer()
X, y = data.data, data.target
# Create and train the model
model = DecisionTreeClassifier(max_depth=4)
model.fit(X, y)
# Predict
print(model.predict([data.data[0]]))
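A fitted tree also records how much each feature contributed to its splits; a minimal sketch, continuing from the model trained above:
# Feature importances learned by the tree (continuing from the example above)
print(model.feature_importances_)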
Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
# Generate regression data
X, y = make_regression(n_features=4, n_samples=100)
# Create and train the model
model = RandomForestRegressor(n_estimators=100)
model.fit(X, y)
# Predict
print(model.predict([[0, 1, 0.5, 1]]))
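Because each tree in the forest is trained on a bootstrap sample, the rows left out of a tree's sample can act as a built-in validation set. A minimal sketch of this out-of-bag estimate, reusing the same data and the standard oob_score option:
# Out-of-bag R^2 estimate (continuing with the same X, y as above)
model_oob = RandomForestRegressor(n_estimators=100, oob_score=True)
model_oob.fit(X, y)
print(model_oob.oob_score_)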
Support Vector Machines (SVM)
SVMs can be used for both classification and regression problems.
SVM Classifier
from sklearn.svm import SVC
from sklearn.datasets import load_wine
# Load the data
data = load_wine()
X, y = data.data, data.target
# Create and train the model
model = SVC(kernel='linear')
model.fit(X, y)
# Predict
print(model.predict([data.data[0]]))
SVM Regression
from sklearn.svm import SVR
from sklearn.datasets import make_regression
# Generate regression data
X, y = make_regression(n_features=4, n_samples=100)
# Create and train the model
model = SVR(kernel='rbf')
model.fit(X, y)
# Predict
print(model.predict([[0, 1, 0.5, 1]]))
Model Evaluation
Evaluating model performance is an important part of the machine learning workflow.
Classification Metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Load the data
data = load_iris()
X, y = data.data, data.target
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Evaluate
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
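A confusion matrix gives a per-class breakdown of the same predictions and is commonly read alongside the report; a minimal sketch, continuing from the example above:
# Per-class breakdown of the predictions (continuing from the example above)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions))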
Regression Metrics
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
# Generate data
X, y = make_regression(n_features=4, n_samples=100)
# Train the model
model = LinearRegression()
model.fit(X, y)
# Evaluate (on the training data here, for brevity)
predictions = model.predict(X)
print(mean_squared_error(y, predictions))
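Mean squared error depends on the scale of the target; the R² score is a scale-free complement that is often reported alongside it. A minimal sketch, continuing from the example above:
# Scale-free complement to MSE (continuing from the example above)
from sklearn.metrics import r2_score
print(r2_score(y, predictions))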
Model Saving and Loading
A trained model can be saved for later reuse.
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Load the data
data = load_iris()
X, y = data.data, data.target
# Train the model
model = RandomForestClassifier()
model.fit(X, y)
# Save the model
joblib.dump(model, 'iris_model.pkl')
# Load the model
loaded_model = joblib.load('iris_model.pkl')
print(loaded_model.predict([data.data[0]]))
Hyperparameter Tuning
Tuning model hyperparameters can improve performance.
Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
# Load the data
data = load_iris()
X, y = data.data, data.target
# Define the parameter grid
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
# Create the grid search
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X, y)
# Best parameters
print(grid_search.best_params_)
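Besides the best parameter combination, the fitted search object also exposes the corresponding cross-validated score and a refitted estimator that can be used directly for prediction; a minimal sketch, continuing from the example above:
# Best cross-validated score and refitted estimator (continuing from the example above)
print(grid_search.best_score_)
print(grid_search.best_estimator_.predict([data.data[0]]))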
Randomized Search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
import numpy as np
# Load the data
data = load_iris()
X, y = data.data, data.target
# Define the parameter distributions
param_dist = {'n_estimators': np.arange(50, 200, 10),
              'max_depth': np.arange(3, 10)}
# Create the randomized search
random_search = RandomizedSearchCV(RandomForestClassifier(), param_dist, n_iter=10, cv=5)
random_search.fit(X, y)
# Best parameters
print(random_search.best_params_)
Feature Engineering
Feature engineering is a key step in improving model performance.
Feature Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
# Load the data
data = load_iris()
X = data.data
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled.mean(axis=0))  # means are close to 0
print(X_scaled.std(axis=0))   # standard deviations are close to 1
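In a real workflow the scaler is usually fit on the training split only and then applied to the test split, so that test data does not leak into the scaling statistics; a minimal sketch, continuing with the same X:
# Fit the scaler on training data only, then reuse it on the test data
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(X, test_size=0.3, random_state=0)
scaler = StandardScaler().fit(X_train)      # statistics come from the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)    # same statistics applied to the test split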
Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import load_iris
# Load the data
data = load_iris()
X, y = data.data, data.target
# Select the 2 best features
selector = SelectKBest(f_classif, k=2)
X_new = selector.fit_transform(X, y)
print(X_new.shape) # (150, 2)
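To see which of the original features were kept, the fitted selector exposes a boolean mask and the scores used for the ranking; a minimal sketch, continuing from the example above:
# Which features were selected, and their ANOVA F-scores (continuing from the example above)
print(selector.get_support())
print(selector.scores_)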
Pipelines
A Pipeline chains multiple processing steps into a single estimator.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the data
data = load_iris()
X, y = data.data, data.target
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Create the pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])
# Train
pipe.fit(X_train, y_train)
# Evaluate
print(pipe.score(X_test, y_test))
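Because the scaler is a step inside the pipeline, new samples passed to predict are scaled automatically with the statistics learned during fit; a minimal sketch, continuing from the example above (the sample values are illustrative):
# Predict a new sample; scaling is applied automatically (continuing from the example above)
print(pipe.predict([[5.1, 3.5, 1.4, 0.2]]))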
Ensemble Methods
Ensemble methods combine multiple models to improve performance.
Voting Classifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the data
data = load_iris()
X, y = data.data, data.target
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Create the voting classifier
model = VotingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier()),
    ('svc', SVC())
], voting='hard')
# Train
model.fit(X_train, y_train)
# Evaluate
print(model.score(X_test, y_test))
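Hard voting takes a majority vote over the predicted labels. An alternative worth knowing (not shown above) is soft voting, which averages predicted class probabilities and therefore requires every estimator to expose predict_proba, e.g. SVC(probability=True); a minimal sketch, reusing the same train/test split:
# Soft voting averages class probabilities (continuing with the same split as above)
soft_model = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=1000)),  # larger max_iter to avoid convergence warnings on unscaled features
    ('dt', DecisionTreeClassifier()),
    ('svc', SVC(probability=True))              # probability=True enables predict_proba
], voting='soft')
soft_model.fit(X_train, y_train)
print(soft_model.score(X_test, y_test))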
Gradient Boosted Trees (GBDT)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Load the data
data = load_breast_cancer()
X, y = data.data, data.target
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Create the model
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
# Train
model.fit(X_train, y_train)
# Evaluate
print(model.score(X_test, y_test))
The examples above cover the basic usage of the most common machine learning models in Sklearn. In practice, choose a model that fits the specific problem and apply appropriate hyperparameter tuning and feature engineering.