sklearn使用之Pipeline、FeatureUnion、GridSearchCV代码示例
示例一:使用pipeline串联处理操作、使用GridSearchCV搜索最佳参数组合
from sklearn.feature_extraction.text import CountVectorizer #计数矢量器
from sklearn.feature_extraction.text import TfidfVectorizer #Tf-idf矢量器
from sklearn.naive_bayes import MultinomialNB #多项式朴素贝叶斯
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Training inputs come from `tweets`, which is defined outside this snippet
# (presumably a pandas DataFrame -- confirm against the loading code).
X = tweets['SentimentText']  # training text
y = tweets['Sentiment']  # text labels

# Example: classify the text samples with CountVectorizer + MultinomialNB,
# using Pipeline to chain the processing steps and GridSearchCV to search
# for the best parameter combination.

# Build the pipeline: vectorize the text, then feed it to the classifier.
pipe = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('classify', MultinomialNB()),
    ]
)

# Candidate values for each pipeline parameter; every key is written as
# "<step name>__<parameter name>".
pipe_params = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__max_features': [1000, 10000],
    'vect__stop_words': [None, 'english'],
}

# Create the grid search over the pipeline and fit it on the data.
grid = GridSearchCV(estimator=pipe, param_grid=pipe_params)
grid.fit(X, y)

# Show the best score and the parameter combination that achieved it.
print(grid.best_score_, grid.best_params_)
示例二:使用FeatureUnion合并特征
from sklearn.pipeline import FeatureUnion
# Combine the features generated by two vectorizers into one feature set.
featurizer = FeatureUnion(
    transformer_list=[
        ('tfidf_vect', TfidfVectorizer()),
        ('count_vect', CountVectorizer()),
    ]
)

# Configure both vectorizers through the union, addressing each parameter
# as "<transformer name>__<parameter name>".
featurizer.set_params(
    tfidf_vect__max_features=100,
    count_vect__ngram_range=(1, 2),
    count_vect__max_features=300,
)

# Fit both vectorizers on X and merge their outputs, then print the shape
# of the merged result.
_ = featurizer.fit_transform(X)
print(_.shape)
示例三:使用FeatureUnion合并特征、pipeline串联处理操作、GridSearchCV搜索最佳参数组合
# Create a new pipeline that chains feature generation/merging (the
# FeatureUnion built earlier) with training of the classification model.
pipe = Pipeline(
    steps=[
        ('featurizer', featurizer),
        ('classify', MultinomialNB()),
    ]
)

# Parameter ranges to search. Keys are nested as
# "<pipeline step>__<union transformer>__<parameter name>".
pipe_params = {
    'featurizer__count_vect__ngram_range': [(1, 1), (1, 2)],
    'featurizer__count_vect__max_features': [1000, 10000],
    'featurizer__count_vect__stop_words': [None, 'english'],
    'featurizer__tfidf_vect__ngram_range': [(1, 1), (1, 2)],
    'featurizer__tfidf_vect__max_features': [1000, 10000],
    'featurizer__tfidf_vect__stop_words': [None, 'english'],
}

# Grid-search the best parameter combination and report the result.
grid = GridSearchCV(estimator=pipe, param_grid=pipe_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)