import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import gensim
import jieba
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.metrics import classification_report
import numpy as np
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
# Load the data and tokenize (加载数据以及分词)
# Load the Fudan Chinese text-classification corpus and tokenize the body text.
# Original line fused "import jieba" onto the read_excel() call (a SyntaxError);
# jieba is already imported at the top of the file, so the stray import is dropped.
data = pd.read_excel('复旦大学中文文本分类语料.xlsx')

# Enable jieba's parallel tokenization with 18 worker processes.
# NOTE(review): enable_parallel is POSIX-only (uses fork) — confirm the target
# platform before relying on it.
jieba.enable_parallel(18)

# Tokenize each document and join the tokens with spaces in a single pass,
# instead of first storing generators and then joining them in a second loop.
data['文本分词'] = data['正文'].apply(lambda text: ' '.join(jieba.cut(text)))
# Expected jieba initialization output:
#   Building prefix dict from the default dictionary ...
#   Loading model from cache /tmp/jieba.cache
#   Loading model cost 0.715 seconds.
#   Prefix dict has been built successfully.
# Encode the text category labels as integer ids (将文本标签转换为数值标签)
# Fit a LabelEncoder on the category column so each distinct text label
# is mapped to a consecutive integer id; y holds the encoded target vector.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data['分类'].values)