文章目录
支持向量机
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import *
# 导入数据
train_data= pd.read_csv('F:\PY-Learning\CNEWS\cnews\cnews.train.txt', names=['title', 'content'], sep='\t', engine='python', encoding='UTF-8') # (50000, 2)
test_data = pd.read_csv('F:\PY-Learning\CNEWS\cnews\cnews.test.txt', names=['title', 'content'], sep='\t',engine='python',encoding='UTF-8') # (10000, 2)
val_data = pd.read_csv('F:\PY-Learning\CNEWS\cnews\cnews.val.txt', names=['title', 'content'], sep='\t',engine='python',encoding='UTF-8') # (5000, 2)
x_train = train_data['content']
x_test = test_data['content']
x_val = val_data['content']
y_train = train_data['title']
y_test = test_data['title']
y_val = val_data['title']
# print(y_val)
###################################################
#############处理样本#################################
## 默认不去停用词的向量化
count_vec = CountVectorizer()
x_count_train = count_vec.fit_transform(x_train )
x_count_test = count_vec.transform(x_test )
## 去除停用词
count_stop_vec = CountVectorizer(analyzer='word', stop_words='english')
x_count_stop_train = count_stop_vec.fit_transform(x_train)
x_count_stop_test = count_stop_vec.transform(x_test)
## 模型训练
mnb_count = SVC()
mnb_count.fit(x_count_train, y_train)
mnb_count_y_predict = mnb_count.predict(x_count_test)
mnb_count.score(x_count_test, y_test)
## TF−IDF处理后在训练
## 默认配置不去除停用词
tfid_vec = TfidfVectorizer()
x_tfid_train = tfid_vec.fit_transform(x_train)
x_tfid_test = tfid_vec.transform(x_test)
## 模型训练
mnb_tfid = SVC()
mnb_tfid.fit(x_tfid_train, y_train)
mnb_tfid_y_predict = mnb_tfid.predict(x_tfid_test)
mnb_tfid.score(x_tfid_test, y_test)