首先是一个大概的开发步骤:读取数据并划分训练集与测试集 → 将文本转换为 TF-IDF 向量 → 训练逻辑回归模型 → 用网格搜索(GridSearchCV)自动调参 → 用混淆矩阵评估结果。
下面附上完整代码(来自贪心学院):
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
# Load the data set. The CSV is read without a header row; column 0 is used
# as the label and column 1 as the sentence text.
data = pd.read_csv('ISEAR.csv', header=None)
labels = data[0].values.tolist()
sents = data[1].values.tolist()

# Hold out 20% of the samples for evaluation with a fixed seed so the split is
# reproducible. NOTE: `X_text` / `y_text` hold the held-out (test) split.
X_train, X_text, y_train, y_text = train_test_split(sents, labels, test_size=0.2, random_state=42)

# Convert the sentences to TF-IDF vectors. The vectorizer is fitted on the
# training split only and then reused on the held-out split, so no vocabulary
# or IDF statistics are learned from the evaluation data.
vectorized = TfidfVectorizer()
X_train = vectorized.fit_transform(X_train)
X_text = vectorized.transform(X_text)

# Baseline: logistic regression with default hyper-parameters.
# NOTE(review): if scikit-learn emits a ConvergenceWarning here, consider
# raising `max_iter` -- left at the default to keep results unchanged.
paramter = {'C':[0.001, 0.01, 0.1, 0.05, 0.5, 2, 3, 10, 5]}
lr = LogisticRegression()
# Keep and report the score; as a bare expression it would be silently
# discarded when this file is run as a script.
baseline_score = lr.fit(X_train, y_train).score(X_text, y_text)
print('baseline accuracy:', baseline_score)

# Hyper-parameter search: 5-fold cross-validated grid search over the
# regularisation strength C, using the training split only.
clf = GridSearchCV(lr, paramter, cv=5)
clf.fit(X_train, y_train)
# Score of the best estimator found by the search on the held-out split.
tuned_score = clf.score(X_text, y_text)
print('tuned accuracy:', tuned_score)
print(clf.best_params_)

# Confusion matrix on the held-out split: rows are true labels, columns are
# predicted labels.
result = confusion_matrix(y_text, clf.predict(X_text))
print(result)