- 数据读取
import pandas as pd
import jieba
# Load the SMS data set, decoding the file as GBK. Passing header=0 together
# with names=[...] tells pandas to discard the file's own header row and use
# the column names given here instead.
data = pd.read_csv(
    r"E:\数据\实验data\messages.csv",
    encoding='gbk',
    header=0,
    names=['ID', 'label', 'text'],
)
# print(data.head())
- 短信分词
def _segment(message):
    """Return *message* segmented by jieba, tokens joined by single spaces."""
    return ' '.join(jieba.cut(message))


# Add a column holding the space-separated tokens of every message.
data['cut_message'] = data['text'].apply(_segment)
# print(data.head())

# Plain arrays of the segmented texts (x) and their labels (y).
x, y = data['cut_message'].values, data['label'].values
- 训练集、测试集划分
# train_test_split lives in sklearn.model_selection. The old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so it is only used as a fallback on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 10% of the samples for testing (test : train = 1 : 9).
# NOTE: no random_state is passed, so the split differs from run to run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1)
- 模型训练与预测
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the vocabulary from the training messages and convert them into a
# document-term count matrix in a single step.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more word characters, so single-character words produced by the
# segmentation step are dropped -- confirm this is intended.
vectorizer = CountVectorizer()
x_train_termcounts = vectorizer.fit_transform(train_x)
tfidf_transformer = TfidfTransforme