1. Imports
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
2. Create the bag-of-words/TF-IDF vectorizer and the tokenizer functions:
tfidv = TfidfVectorizer(lowercase=False)

def tokenizer_accurate(text):
    return jieba.lcut(text, cut_all=False)  # accurate mode: non-overlapping segmentation

def tokenizer_all(text):
    return jieba.lcut(text, cut_all=True)   # full mode: every word the dictionary can find, overlaps allowed

def tokenizer_search(text):
    return jieba.lcut_for_search(text)      # search-engine mode: accurate mode, then long words re-cut into subwords
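To see how the three modes differ, run them on a short sentence. A minimal sketch; the exact token lists in the comments are typical jieba output but may vary with your jieba version and dictionary:

sample = '我来到北京清华大学'
print(tokenizer_accurate(sample))  # e.g. ['我', '来到', '北京', '清华大学']
print(tokenizer_all(sample))       # e.g. ['我', '来到', '北京', '清华', '清华大学', '华大', '大学']
print(tokenizer_search(sample))    # e.g. ['我', '来到', '北京', '清华', '华大', '大学', '清华大学']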
3. Chinese stop words:
Download a stop-word list from the web (a plain-text file, one word per line), then:
def chinese_stop():
    # read one stop word per line; the 'with' block closes the file automatically
    with open('中文停用词.txt', 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
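A quick sanity check that the list loaded correctly (the length and entries depend on whichever stop-word file you downloaded):

stop_words = chinese_stop()
print(len(stop_words))   # number of stop words in the file
print(stop_words[:5])    # first few entries, typically punctuation and particles such as '的'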
4. Build the model
stop_words = chinese_stop()
grid_params = {'venc__tokenizer': [tokenizer_all, tokenizer_accurate, tokenizer_search],
               'venc__ngram_range': [(1, 1)],
               'venc__stop_words': [stop_words, None],
               'lgn__C': [1.0, 10.0, 15.0],
               'lgn__penalty': ['l2', 'l1'],
               'lgn__solver': ['liblinear']}  # liblinear supports both l1 and l2 penalties
pipe_lr = Pipeline([('venc', tfidv), ('lgn', LogisticRegression(random_state=12, max_iter=1000))])
gs = GridSearchCV(pipe_lr, grid_params, scoring='accuracy', n_jobs=1, cv=10, verbose=3)
When verbose >= 2, GridSearchCV prints the progress of each fold/parameter fit.
cv can be lowered to 5 (5-fold instead of 10-fold cross-validation) to reduce the runtime.
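The code above stops before fitting. A minimal sketch of the remaining step, assuming a hypothetical data.csv with a 'text' column of raw Chinese strings and a 'label' column of class ids (substitute your own dataset):

df = pd.read_csv('data.csv')   # hypothetical file: columns 'text' and 'label'
gs.fit(df['text'], df['label'])
print(gs.best_params_)         # best tokenizer / stop-word / C / penalty combination
print(gs.best_score_)          # mean cross-validated accuracy of that combination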