import os
import random
import jieba
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
# Load the stop-word file into a one-column table named 'stopword'
# (tab-separated, no header row because explicit column names are given).
stopwords = pd.read_csv(
    "E:/datas/stopwords.txt",
    sep="\t",
    names=['stopword'],
    index_col=False,
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as ordinary text
    encoding='utf-8',
)
stopwords.head()  # notebook-style preview; the result is discarded when run as a script
# From here on `stopwords` is the column's underlying array, not the DataFrame.
stopwords = stopwords['stopword'].values
# Build the corpus: every sub-folder of `folder_path` is treated as one class,
# and the sub-folder name is stored as the label of each document read from it.
folder_path = 'E:/datas/THUCNews/THUCNews'
folder_list = os.listdir(folder_path)  # entries found directly under folder_path
MAX_SAMPLES_PER_CLASS = 10000  # at most this many text files are read per class

# `stopwords` is an array; testing `token in array` scans the whole array for
# every token. A set gives the same answer with a constant-time lookup.
stopword_set = set(stopwords)

sentences = []
for folder in folder_list:
    new_folder_path = os.path.join(folder_path, folder)
    # Skip stray files in the dataset root; os.listdir() would raise on them.
    if not os.path.isdir(new_folder_path):
        continue
    files = os.listdir(new_folder_path)  # text files belonging to this class
    for file in files[:MAX_SAMPLES_PER_CLASS]:
        with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:
            raw = f.read()
        # Segment the text, then drop single-character tokens and stop words.
        word_list = [
            word
            for word in jieba.lcut(raw, cut_all=False)
            if len(word) > 1 and word not in stopword_set
        ]
        # Store each document as (space-joined tokens, label).
        sentences.append((" ".join(word_list), folder))

random.shuffle(sentences)
from sklearn.model_selection import train_test_split
# Separate the (text, label) pairs into two parallel tuples: documents and labels.
x, y = zip(*sentences)
# Hold out 20% of the samples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Report both split sizes. A bare `len(x_test)` expression is only echoed by a
# notebook/REPL, so it must be printed explicitly to show up in a script run.
print(len(x_train))
print(len(x_test))
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
class TextClassifier():
    """Text classifier that pairs a CountVectorizer with a scikit-learn style estimator.

    Documents are turned into count features over word n-grams of length 1 to 4
    (vocabulary limited via ``max_features=20000``) and passed to ``classifier``.
    """

    def __init__(self, classifier=None):
        """Create the vectorizer and store the estimator.

        Args:
            classifier: object exposing ``fit``, ``predict`` and ``score``.
                When omitted, a new ``MultinomialNB()`` is created.
        """
        # The default is built here rather than in the signature: a default
        # written as `classifier=MultinomialNB()` is evaluated once, so every
        # instance would share (and re-fit) the very same estimator object.
        self.classifier = MultinomialNB() if classifier is None else classifier
        self.vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,4), max_features=20000)

    def features(self, X):
        """Return the feature matrix of documents ``X`` using the fitted vocabulary."""
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        """Learn the vocabulary from ``X`` and train the classifier on labels ``y``.

        Returns:
            The instance itself, so calls can be chained.
        """
        # fit_transform learns the vocabulary and builds the matrix in one pass,
        # instead of analysing the training documents twice (fit, then transform).
        self.classifier.fit(self.vectorizer.fit_transform(X), y)
        return self

    def predict(self, X):
        """Return the classifier's predictions for documents ``X``."""
        return self.classifier.predict(self.features(X))

    def score(self, X, y):
        """Return the classifier's own ``score`` for documents ``X`` and labels ``y``."""
        return self.classifier.score(self.features(X), y)
# Train a TextClassifier with its default estimator, then evaluate it on the
# held-out split.
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)

test_score = text_classifier.score(x_test, y_test)
print(test_score)

y_pred = text_classifier.predict(x_test)

# Per-class metrics.
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix.
matrix = confusion_matrix(y_test, y_pred)
print(matrix)
# Experiment headings (vectorizer, classifier) carried over from the notebook:
#   CountVectorizer, MultinomialNB()
#   TfidfVectorizer, MultinomialNB()
#   CountVectorizer, classifier=SVC
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
class TextClassifier():
    """Text classifier that pairs a CountVectorizer with a scikit-learn style estimator.

    This definition replaces any earlier ``TextClassifier`` in the module and
    defaults to a linear-kernel SVC. Documents are turned into count features
    over word n-grams of length 1 to 4 (vocabulary limited via
    ``max_features=20000``) and passed to ``classifier``.
    """

    def __init__(self, classifier=None):
        """Create the vectorizer and store the estimator.

        Args:
            classifier: object exposing ``fit``, ``predict`` and ``score``.
                When omitted, a new ``SVC(kernel='linear')`` is created.
        """
        # The default is built here rather than in the signature: a default
        # written as `classifier=SVC(...)` is evaluated once, so every instance
        # would share (and re-fit) the very same estimator object.
        self.classifier = SVC(kernel='linear') if classifier is None else classifier
        self.vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,4), max_features=20000)

    def features(self, X):
        """Return the feature matrix of documents ``X`` using the fitted vocabulary."""
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        """Learn the vocabulary from ``X`` and train the classifier on labels ``y``.

        Returns:
            The instance itself, so calls can be chained.
        """
        # fit_transform learns the vocabulary and builds the matrix in one pass,
        # instead of analysing the training documents twice (fit, then transform).
        self.classifier.fit(self.vectorizer.fit_transform(X), y)
        return self

    def predict(self, X):
        """Return the classifier's predictions for documents ``X``."""
        return self.classifier.predict(self.features(X))

    def score(self, X, y):
        """Return the classifier's own ``score`` for documents ``X`` and labels ``y``."""
        return self.classifier.score(self.features(X), y)
# Repeat the experiment with the TextClassifier defined just above:
# fit on the training split, then report results on the test split.
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)

held_out_score = text_classifier.score(x_test, y_test)
print(held_out_score)

y_pred = text_classifier.predict(x_test)

# Per-class metrics.
metrics_report = classification_report(y_test, y_pred)
print(metrics_report)

# Confusion matrix.
confusion = confusion_matrix(y_test, y_pred)
print(confusion)
# Experiment heading carried over from the notebook: TfidfVectorizer, classifier=SVC