# Hyperparameter tuning via grid search.
# Uses 5000 rows of data; tuning is done with five-fold cross-validation.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
import gensim
import time
import pickle
import csv,sys
# Read the first 5000 rows of the training set. The 'article' column is
# dropped because only the 'word_seg' column is used as model input.
df = pd.read_csv('data/train_set.csv', nrows=5000)
df.drop(columns='article', inplace=True)
# observe data
# print(df['class'].value_counts(normalize=True, ascending=False))

# Name of the text feature column. Not referenced by the pipeline below;
# kept so that any code reading this name keeps working.
predictor = ['word_seg']

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on all rows would let
# the validation documents shape the vocabulary and IDF weights, which leaks
# information and makes the validation score optimistic.
# A fixed random_state keeps the split identical across runs so that scores
# obtained with different hyperparameters are comparable.
text_train, text_validation, y_train, y_validation = train_test_split(
    df['word_seg'], df['class'], test_size=0.2, random_state=42)

# TF-IDF over unigrams and bigrams, fitted on the training text only.
# min_df=3 ignores terms seen in fewer than 3 documents, max_df=0.9 ignores
# terms seen in more than 90% of documents, sublinear_tf uses 1 + log(tf).
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, sublinear_tf=True)
x_train = vectorizer.fit_transform(text_train)
# The validation text is only transformed with the already-fitted vectorizer.
x_validation = vectorizer.transform(text_validation)
# Stand-alone LightGBM estimator. It is not part of the ensemble below; it is
# the estimator referenced by the commented-out grid search further down.
# (Earlier assignments of other models to `clf` were immediately overwritten,
# so only this one is kept.)
clf = lgb.sklearn.LGBMClassifier(learning_rate=0.1, n_estimators=50, num_leaves=10)

# Linear models whose per-class decision scores are averaged into an ensemble.
algorithms = [
    LogisticRegression(C=10, max_iter=20),
    svm.LinearSVC(C=1, max_iter=20),
]

full_predictions = []
for alg in algorithms:
    # Fit each model on the training split.
    alg.fit(x_train, y_train)
    # Collect the raw decision scores for the validation split. No dtype
    # conversion is applied: TF-IDF features are already floating point.
    # NOTE: the argmax over axis=1 below assumes a 2-D score matrix, i.e. more
    # than two classes (binary problems yield a 1-D score vector).
    predictions = alg.decision_function(x_validation)
    full_predictions.append(predictions)

# Average the score matrices of all models (element-wise mean).
# NOTE(review): the two models' scores are on different scales, so this is
# not a calibrated average — confirm that this weighting is intended.
y_prediction = np.mean(full_predictions, axis=0)

# Translate the winning column index back to its class label via the fitted
# `classes_` array instead of assuming labels are exactly 1..K. All models are
# fitted on the same y_train, so they share the same sorted `classes_`; this
# stays correct even if a label is missing from the training split.
y_prediction = algorithms[0].classes_[np.argmax(y_prediction, axis=1)]
# Grid search over the LightGBM estimator `clf` (disabled; uncomment to run):
# param_grid = {
# 'num_leaves': [10, 20, 30],
# 'learning_rate': [0.01, 0.05, 0.1],
# 'n_estimators': [10, 20, 50]
# }
# gbm = GridSearchCV(clf, param_grid, cv=5, scoring='f1_micro', n_jobs=4, verbose=1)
# gbm.fit(x_train, y_train)
# print('网格搜索得到的最优参数是:', gbm.best_params_)
# Evaluate the predictions on the validation split.
# Candidate class labels 1..19, passed explicitly to the metric.
label = list(range(1, 20))
# Micro-averaged F1 aggregates the counts over all listed labels.
f1 = f1_score(y_validation, y_prediction, average='micro', labels=label)
score_text = "%.4f" % f1
print('The F1 Score: ' + score_text)