It is generally accepted in academia that the last 40 chapters of Dream of the Red Chamber (《红楼梦》) were not written by Cao Xueqin (曹雪芹). Here we use machine-learning methods to examine that claim.
Principle
Every author has their own habits of word choice and style, and even deliberate imitation leaves many traces.
In classical Chinese, function words (文言虚词) are distributed fairly evenly: every chapter of the book contains many of them, and what differs is how often each one appears. We therefore use the frequency of these function words as features.
Besides function words, other words also appear with high frequency in every chapter. For example, a word-frequency count of chapter 80 gives:
- 了 172
- 的 142
- 我 70
- 宝玉 65
- 你 61
- 道 54
- 他 51
These high-frequency words can also be used as features.
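Counts like the ones above can be reproduced with jieba and collections.Counter. This is only a minimal sketch; the path text/chapter-80 assumes the per-chapter files produced by the preprocessing code below.
import jieba
from collections import Counter

# Hypothetical path: plain text of chapter 80, one of the files written by the chapter splitter below
with open('text/chapter-80', encoding='utf-8') as f:
    text = f.read()
words = [w for w in jieba.cut(text) if w.strip()]  # segment and drop pure-whitespace tokens
for word, count in Counter(words).most_common(7):  # the most frequent words
    print(word, count)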
This article takes chapters 20-29 (where the amount of poetry and verse is relatively balanced) as the training samples for class 1, and chapters 110-119 as the training samples for class 2.
The feature vectors of the two classes are fed into a classifier such as an SVM (support vector machine) to train a classification model, which is then used to classify the remaining chapters and see which class each of them leans toward.
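For reference, the SVM step described here could look like the following minimal sketch. It assumes the trainset.npy and testset.npy files produced by the feature-extraction code further below, where the last column of every row is the class label; the classification code later in this post actually uses naive Bayes rather than an SVM.
import numpy as np
from sklearn.svm import SVC

# trainset.npy / testset.npy are produced by modelBuilder below;
# the last column of each row holds the class label (0 as a placeholder in the test set)
trainset = np.load('trainset.npy')
testset = np.load('testset.npy')
X_train, y_train = trainset[:, :-1], trainset[:, -1]
X_test = testset[:, :-1]
clf = SVC(kernel='linear')  # a linear kernel is a reasonable default for word-frequency features
clf.fit(X_train, y_train)
print(clf.predict(X_test))  # predicted class (1 or 2) for each of the 120 chapters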
Word segmentation
After obtaining the text, we first split it into chapters, then remove punctuation, segment the text into words, and count word frequencies.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import string
import jieba
class textProcesser(object):
    def __init__(self):
        pass
    # Split the book into one file per chapter (text/chapter-1 ... text/chapter-120)
    def divide_into_chapter(self):
        file_in = open('text/redmansions.txt', 'r', encoding='utf-8')
        line = file_in.readline()
        chapter_cnt = 0
        chapter_text = ""
        while line:
            if '[(' in line:  # marker used for chapter headings in this copy of the text
                if chapter_cnt >= 1:
                    # a new heading starts, so write out the chapter that just ended
                    file_out = open('text/chapter-' + str(chapter_cnt), 'a', encoding='utf-8')
                    file_out.write(chapter_text)
                    file_out.close()
                chapter_cnt += 1
                chapter_text = line
            else:
                chapter_text += line
            line = file_in.readline()
        if chapter_cnt >= 1:
            # write out the final chapter
            file_out = open('text/chapter-' + str(chapter_cnt), 'a', encoding='utf-8')
            file_out.write(chapter_text)
            file_out.close()
        file_in.close()
    # Segment one chapter into words
    def divide_into_words(self, document, docID):
        path_str = 'text/chapter-words-' + str(docID)
        file_out = open(path_str, 'a', encoding='utf-8')
        line = document.readline()
        while line:
            seg_list = jieba.cut(line, cut_all=False)
            words = " ".join(seg_list)
            file_out.write(words)
            line = document.readline()
        file_out.close()
    # Segment every chapter
    def perform_segmentation(self):
        for loop in range(1, 121):
            path_str = 'text/chapter-' + str(loop)
            file_in = open(path_str, 'r', encoding='utf-8')
            self.divide_into_words(file_in, loop)
            file_in.close()
    # Strip punctuation from one document, then count word frequencies
    def count_words(self, document, docID):
        result_dict = {}
        line = str(document)
        line = line.translate(str.maketrans('', '', string.punctuation))  # remove English punctuation
        line = "".join(line.split('\n'))  # remove newlines
        line = self.sub_replace(line)  # remove Chinese punctuation
        words = line.split()
        for word in words:
            if word not in result_dict:
                result_dict[word] = 1
            else:
                result_dict[word] += 1
        path_str = 'text/chapter-wordcount-' + str(docID)
        file_out = open(path_str, 'a', encoding='utf-8')
        # Sort by frequency and write out, one "word<TAB>count" pair per line
        sorted_result = sorted(result_dict.items(), key=lambda d: d[1], reverse=True)
        for one in sorted_result:
            file_out.write(one[0] + '\t' + str(one[1]) + '\n')
        file_out.close()
    # Count word frequencies for every chapter
    def perform_wordcount(self):
        for loop in range(1, 121):
            path_str = 'text/chapter-words-' + str(loop)
            file_in = open(path_str, 'r', encoding='utf-8')
            line = file_in.readline()
            document = ""
            while line:
                document += line
                line = file_in.readline()
            self.count_words(document, loop)
            file_in.close()
    # Keep only CJK characters, letters, digits and whitespace
    def sub_replace(self, line):
        regex = re.compile(r"[^\u4e00-\u9fa5a-zA-Z0-9\s]")
        return regex.sub('', line)
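Assuming the full text sits at text/redmansions.txt as above, the whole preprocessing pipeline can be run with a small driver (a minimal sketch):
if __name__ == '__main__':
    processer = textProcesser()
    processer.divide_into_chapter()    # split the book into text/chapter-1 ... text/chapter-120
    processer.perform_segmentation()   # write segmented text to text/chapter-words-*
    processer.perform_wordcount()      # write sorted word counts to text/chapter-wordcount-*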
Feature selection
[
    '之', '其', '或', '亦', '方', '于', '即', '皆', '因', '仍',
    '故', '尚', '呢', '了', '的', '着', '一', '不', '乃', '呀',
    '吗', '咧', '啊', '把', '让', '向', '往', '是', '在', '越',
    '再', '更', '比', '很', '偏', '别', '好', '可', '便', '就',
    '但', '儿',  # 42 classical function words
    '又', '也', '都', '要',  # high-frequency adverbs
    '这', '那', '你', '我', '他',  # high-frequency pronouns
    '来', '去', '道', '笑', '说'  # high-frequency verbs
]
We take these 42 commonly used classical function words, together with the high-frequency words found by the word-frequency count, as features, and compute each word's frequency in every chapter (occurrences per 1,000 characters) to form that chapter's feature vector.
Code
# -*- coding: utf-8 -*-
import numpy as np
class modelBuilder(object):
    def __init__(self):
        pass
    # Number of characters in a chapter (used to normalize the counts)
    def get_wordnum_of_chapter(self, DocID):
        path_str = 'text/chapter-' + str(DocID)
        file_in = open(path_str, 'r', encoding='utf-8')
        text = ""
        for line in file_in:
            text += "".join(line.split('\n'))  # strip newlines
        file_in.close()
        return len(text)
    # Build the feature vector for one chapter
    def build_feature_vector(self, DocID, label):
        path_str = 'text/chapter-wordcount-' + str(DocID)
        function_word_list = ['之', '其', '或', '亦', '方', '于', '即', '皆', '因', '仍',
                              '故', '尚', '呢', '了', '的', '着', '一', '不', '乃', '呀',
                              '吗', '咧', '啊', '把', '让', '向', '往', '是', '在', '越',
                              '再', '更', '比', '很', '偏', '别', '好', '可', '便', '就',
                              '但', '儿',  # 42 classical function words
                              '又', '也', '都', '要',  # high-frequency adverbs
                              '这', '那', '你', '我', '他',  # high-frequency pronouns
                              '来', '去', '道', '笑', '说']  # high-frequency verbs
        feature_vector_list = []
        for function_word in function_word_list:
            find_flag = 0
            file_in = open(path_str, 'r', encoding='utf-8')  # reopen so the read cursor starts at the top
            line = file_in.readline()
            while line:
                words = line[:-1].split('\t')
                if words[0] == function_word:
                    total_words = self.get_wordnum_of_chapter(DocID)
                    rate = float(words[1]) / total_words * 1000  # occurrences per 1,000 characters
                    rate = float("%.6f" % rate)  # keep six decimal places
                    feature_vector_list.append(rate)
                    find_flag = 1
                    break
                line = file_in.readline()
            file_in.close()
            # if the word never occurs in this chapter, its feature value is 0
            if not find_flag:
                feature_vector_list.append(0)
        feature_vector_list.append(label)
        return feature_vector_list
    def make_positive_trainset(self):
        positive_trainset_list = []
        for loop in range(20, 30):
            feature = self.build_feature_vector(loop, 1)  # label 1 marks class 1 (chapters 20-29)
            positive_trainset_list.append(feature)
        np.save('pos_trainset.npy', positive_trainset_list)
    def make_negative_trainset(self):
        negative_trainset_list = []
        for loop in range(110, 120):
            feature = self.build_feature_vector(loop, 2)  # label 2 marks class 2 (chapters 110-119)
            negative_trainset_list.append(feature)
        np.save('neg_trainset.npy', negative_trainset_list)
    def make_trainset(self):
        feature_pos = np.load('pos_trainset.npy')
        feature_neg = np.load('neg_trainset.npy')
        trainset = np.vstack((feature_pos, feature_neg))
        np.save('trainset.npy', trainset)
    def make_testset(self):
        testset_list = []
        for loop in range(1, 121):
            feature = self.build_feature_vector(loop, 0)  # no label needed here; 0 is a placeholder
            testset_list.append(feature)
        np.save('testset.npy', testset_list)
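The training and test sets can then be generated with a small driver (a minimal sketch; it assumes the text/chapter-* and text/chapter-wordcount-* files from the previous step already exist):
if __name__ == '__main__':
    builder = modelBuilder()
    builder.make_positive_trainset()   # chapters 20-29  -> pos_trainset.npy (label 1)
    builder.make_negative_trainset()   # chapters 110-119 -> neg_trainset.npy (label 2)
    builder.make_trainset()            # stack both into trainset.npy
    builder.make_testset()             # feature vectors for all 120 chapters -> testset.npy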
Meaning of the feature vector
Each chapter is represented by 56 values, one for each selected word, giving that word's frequency in the chapter in occurrences per 1,000 characters; the class label is appended as the final element.
Next we train a classifier. (The text above mentions an SVM, but the code below actually uses scikit-learn's multinomial naive Bayes classifier, MultinomialNB.)
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import get_trainset as ts  # the author's helper module (not shown) that loads the saved feature vectors
x_train = ts.get_train_set().get_all_vector()  # feature vectors for all 120 chapters
class Result(object):
    def __init__(self):
        pass
    # Training rows: chapters 20-29 and 110-119
    def have_Xtrainset(self):
        Xtrainset = x_train
        Xtrainset = np.vstack((Xtrainset[19:29], Xtrainset[109:119]))
        return Xtrainset
    def as_num(self, x):
        y = '{:.10f}'.format(x)  # print probabilities in fixed-point rather than scientific notation
        return y
    def build_model(self):
        x_trainset = self.have_Xtrainset()
        y_classset = np.repeat(np.array([1, 2]), [10, 10])  # labels: 10 chapters of class 1, 10 of class 2
        NBclf = MultinomialNB()
        NBclf.fit(x_trainset, y_classset)  # train the model
        all_vector = x_train
        result = NBclf.predict(all_vector)
        print('Predicted classes for the first ' + str(len(result[0:80])) + ' chapters:')
        print(result[0:80])
        print('Predicted classes for the last ' + str(len(result[80:120])) + ' chapters:')
        print(result[80:120])
        diff_chapter = [80, 81, 83, 84, 87, 88, 90, 100]  # zero-based indices of chapters worth a closer look
        for i in diff_chapter:
            tempr = NBclf.predict_proba([all_vector[i]])  # predict_proba expects a 2-D input
            print('Class probabilities for chapter ' + str(i + 1) + ':')
            print(str(self.as_num(tempr[0][0])) + ' ' + str(self.as_num(tempr[0][1])))
res = Result()
res.build_model()
Results
In the output, 1 means the chapter is assigned to class 1 and 2 means it is assigned to class 2.
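To summarize the predictions numerically, the labels can be tallied as follows (a sketch; it assumes result is the array returned by NBclf.predict above):
import numpy as np
# result: predicted class (1 or 2) for each of the 120 chapters, in order
first_80 = np.asarray(result[:80])
last_40 = np.asarray(result[80:])
print('chapters 1-80 assigned to class 1:', int(np.sum(first_80 == 1)), 'of 80')
print('chapters 81-120 assigned to class 2:', int(np.sum(last_40 == 2)), 'of 40')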
From this we can conclude:
- the first 80 chapters fall into one class and the last 40 into another
- the boundary lies around chapter 80
- the style of the last 40 chapters differs from that of the first 80