数据集为AG News
1. 清洗数据
1.1. 将csv格式数据转为list
import csv
def csv_to_list(filename):
    """Read an AG News style CSV and return samples as [[label, text], ...].

    Each data row is (class, title, description); title and description
    are concatenated into one text field (per the write-up, merging the
    title improved accuracy from 0.888 to 0.896).

    The first row is a header and is skipped.

    :param filename: path to the CSV file
    :return: list of [label, "title description"] pairs
    """
    # newline='' is the documented way to open files for the csv module;
    # pin the encoding so behaviour doesn't depend on the platform locale.
    with open(filename, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip the header row; next(reader, None) is safe on an empty file,
        # unlike the original del(res[0]) which raised IndexError.
        next(reader, None)
        return [[row[0], row[1] + ' ' + row[2]] for row in reader]
1) csv第一行为表头,通过del删去
2) 数据集第二列为title,第三列为description,经过实验发现忽略title的准确率为0.888289,将title并入后准确率为0.896316,拥有了一定的提升,因此在这里将title与description合并
1.2. 语料预处理
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
def clean_list(res):
    """Normalise, tokenise, filter and stem every [label, text] sample.

    Pipeline per sample: replace '-' / '\\' separators with spaces,
    tokenise with nltk, lowercase, keep purely-alphabetic tokens only
    (digits and punctuation dropped), remove English stop words, then
    apply Porter stemming.

    :param res: list of [label, raw_text] pairs (as from csv_to_list)
    :return: list of [label, [token, ...]] pairs
    """
    # NOTE(review): inside a character class '|' is a literal, so this
    # pattern also turns '|' into a space — kept as-is to preserve the
    # original behaviour; confirm whether that was intended.
    separator = re.compile(r'[-|\\]')
    alphabetic = re.compile('^[a-z]+$')
    # A set gives O(1) membership tests; the original list made the
    # stop-word filter O(|stopwords|) per token across the whole corpus.
    stop_set = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    cleaned = []
    for label, text in res:
        tokens = word_tokenize(separator.sub(' ', text))
        words = [w.lower() for w in tokens]
        words = [w for w in words if alphabetic.search(w)]
        words = [w for w in words if w not in stop_set]
        words = [stemmer.stem(w) for w in words]
        cleaned.append([label, words])
    return cleaned
res1: 观察语料,注意到其中包含使用'-'以及'\'的分词方式。为了后续处理方便,这里将它们统一替换为空格
res2: 使用nltk的word_tokenize进行分词
res3: 英文小写化
res4: 删去英文以外的字符(这里假设数字对文本相似度的影响力可以忽略,因此对数字也进行了删除)
res5: 删去stop words
res6: stemming
2. 语料向量化
2.1. 创建字典
import gensim
from gensim import corpora
def make_dictionary(res6):
    """Build a gensim Dictionary (token -> integer id) from the cleaned samples.

    :param res6: list of [label, [token, ...]] pairs (as from clean_list)
    :return: gensim.corpora.Dictionary over all token lists
    """
    documents = [sample[1] for sample in res6]
    return corpora.Dictionary(documents)
2.2. 向量化并添加标签
def list_to_vec(res6, dictionary):
    """Turn cleaned samples into (feature_dict, label) pairs for nltk.

    Each token list is mapped through dictionary.doc2bow; term
    frequencies are discarded, keeping only binary presence
    ({token_id: 1, ...}) as the feature dictionary.

    :param res6: list of [label, [token, ...]] pairs
    :param dictionary: object with a doc2bow(tokens) -> [(id, count), ...] method
    :return: list of (feature_dict, label) tuples
    """
    labeled = []
    for label, tokens in res6:
        bow = dictionary.doc2bow(tokens)
        # Presence-only features: the count from doc2bow is ignored.
        features = {token_id: 1 for token_id, _count in bow}
        labeled.append((features, label))
    return labeled
3. 创建分类器
import nltk
def classifying(all_labeled_data):
    """Train and return an NLTK Naive Bayes classifier.

    :param all_labeled_data: list of (feature_dict, label) pairs
    :return: trained nltk.NaiveBayesClassifier
    """
    return nltk.NaiveBayesClassifier.train(all_labeled_data)
4. 计算准确率
def calculate_acc(classifier, all_test_labeled_data):
    """Return the classifier's accuracy on labelled test data.

    :param classifier: a trained nltk classifier
    :param all_test_labeled_data: list of (feature_dict, label) pairs
    :return: accuracy as a float in [0, 1]
    """
    score = nltk.classify.accuracy(classifier, all_test_labeled_data)
    return score
5. main
# --- Training phase: load, clean and vectorise the training set ---
train = csv_to_list('train.csv')
clean_train = clean_list(train)
# The dictionary is built from the training set only and reused below
# for the test set, so both share one token-id space.
dictionary = make_dictionary(clean_train)
all_labeled_data = list_to_vec(clean_train, dictionary)
classifier = classifying(all_labeled_data)
# --- Evaluation phase: same cleaning/vectorisation on the test set ---
test = csv_to_list('test.csv')
clean_test = clean_list(test)
all_test_labeled_data = list_to_vec(clean_test, dictionary)
accuracy = calculate_acc(classifier, all_test_labeled_data)
print (accuracy)