用机器学习判定红楼梦后40回是否曹雪芹所写-CSDN博客

本文链接：https://blog.csdn.net/u013591119/article/details/84134557

本文通过分词统计《红楼梦》中高频词汇，使用SVM算法预测章节归属，实现对古典文学的现代数据分析。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

思路来源于此，注明出处：

https://mp.weixin.qq.com/s/5qqkHFVPhKlWcewCzSX5HQ

文章里已经讲得很清楚了：我们对《红楼梦》的每一回进行分词，统计各个词的出现次数（会过滤掉一些虚词，过滤方法和上一篇《数据分析：当回音哥唱music时，他在唱些什么~~~》相同）。

首先展示一下我们从网上找到的红楼梦素材：

由于有页码的标注，还有一些‘------’和分节阅读字段的干扰，我们在做分词的时候要对这些东西处理，话不多说，先上代码：

# coding=UTF-8

import re
import jieba.posseg as psg
from collections import Counter
import xlwt

def check_is_title(line):
    """Tell whether ``line`` begins with a chapter heading.

    A heading is the character 第, followed by 1 to 10 characters from the
    class in the pattern below, followed by 回. Only the start of the line is
    examined (``re.match``), so a heading appearing later in the line does
    not count.

    Returns:
        True if the line starts with such a heading, otherwise False.
    """
    heading = re.match('第[\u4e00-\u9fa5\u767e\u5343\u96f6]{1,10}回', line)
    return heading is not None

def check_is_sundry(line):
    """Tell whether ``line`` carries one of the non-story marker strings.

    A line is treated as sundry when it contains the text ``page``, a run of
    twelve hyphens, or the text ``分节阅读`` anywhere inside it.

    Returns:
        True if any marker occurs in the line, otherwise False.
    """
    markers = ('page', '------------', '分节阅读')
    for marker in markers:
        if marker in line:
            return True
    return False

def check_word_characteristic(word_flag):
    """Decide whether a word with part-of-speech tag ``word_flag`` is kept.

    The word is rejected when the tag contains any of the letters ``r``,
    ``p``, ``c`` or ``u``; every other tag (including an empty one) is kept.

    NOTE(review): this is a substring test, so a multi-letter tag that merely
    contains one of these letters (e.g. ``nr``) is rejected as well — confirm
    that this is intended.

    Returns:
        False if the tag contains one of the four letters, otherwise True.
    """
    excluded_letters = ('r', 'p', 'c', 'u')
    return not any(letter in word_flag for letter in excluded_letters)

def init_excel():
    """Lay out the empty count table on the module-level ``sheet``.

    Column 0 receives the words of ``frequently_used_words`` (rows 1..N),
    row 0 receives the chapter numbers 1..120 (columns 1..120), and every
    cell of the N x 120 body is pre-filled with 0.
    """
    chapter_total = 120

    # Word labels down the first column, starting below the header row.
    row_no = 1
    for word in frequently_used_words:
        sheet.write(row_no, 0, word)
        row_no += 1

    # Chapter numbers across the header row.
    for chapter_no in range(1, chapter_total + 1):
        sheet.write(0, chapter_no, chapter_no)

    # Zero-fill the body so words absent from a chapter still have a value.
    for row_no in range(1, len(frequently_used_words) + 1):
        for col_no in range(1, chapter_total + 1):
            sheet.write(row_no, col_no, 0)

def save_date(date, index):
    """Write one chapter's word counts into the module-level ``sheet``.

    Args:
        date: iterable of entries whose element 0 is a word and element 1 is
            its count; every word must be present in
            ``frequently_used_words`` (``list.index`` raises ValueError
            otherwise).
        index: zero-based chapter position; counts go to column ``index + 1``.
    """
    column = index + 1
    for entry in date:
        # The word's position in the vocabulary list fixes its row.
        row = frequently_used_words.index(entry[0]) + 1
        sheet.write(row, column, entry[1])

# Vocabulary of tracked words. A word's position in this list fixes its row in
# the spreadsheet: init_excel() writes the word at position i to row i + 1 and
# save_date() finds the row with list.index(). Only words present here are
# counted per chapter.
# NOTE(review): presumably the output of a most_common(280) pass over the whole
# text — confirm before regenerating.
frequently_used_words = ['道', '说', '是', '也', '又', '人', '来', '去', '不', '便', '有', '笑', '都', '就', '叫', '呢', '见', '一', '听', '要', '儿', '好', '还', '只', '个', '一个', '上', '到', '才', '问', '倒', '们', '罢', '看', '袭', '忙', '事', '说道', '姑娘', '知道', '吃', '再', '如今', '拿', '起来', '大', '些', '出来', '太太', '将', '众人', '一面', '奶奶', '没', '做', '没有', '想', '里', '只见', '请', '话', '听见', '走', '两个', '不知', '家', '就是', '进来', '二', '平儿', '时', '坐', '东西', '告诉', '回', '回来', '已', '丫头', '老爷', '下', '却', '中', '只得', '大家', '子', '带', '打', '不敢', '送', '先', '命', '小', '多', '可', '出去', '瞧', '不好', '姐姐', '死', '出', '起', '心里', '过来', '鸳鸯', '二爷', '哭', '屋里', '一时', '不能', '找', '竟', '今日', '答应', '湘云', '几个', '银子', '住', '更', '正', '说话', '心', '还有', '晴雯', '无', '钱', '三', '病', '快', '知', '怕', '外头', '字', '茶', '一回', '头', '自然', '打发', '睡', '后', '问道', '看见', '人家', '么', '妹妹', '早', '内', '闹', '媳妇', '不用', '今儿', '忽', '气', '站', '拉', '方', '会子', '罢了', '皆', '作', '别', '不得', '让', '听说', '原来', '到底', '一声', '里头', '家里', '老', '一句', '回去', '连忙', '使', '喝', '众', '完', '前', '日', '进去', '很', '婆子', '已经', '放', '一件', '过去', '两', '麝月', '香菱', '起身', '写', '方才', '身上', '太', '哥哥', '心中', '跑', '如', '小厮', '越发', '进', '四', '劝', '喜欢', '骂', '今', '瞧瞧', '手', '谁知', '果然', '玉', '官', '原', '夫人', '一日', '难道', '跟前', '岂', '不必', '酒', '弄', '遂', '丫鬟', '会', '明儿', '象', '吩咐', '时候', '主意', '急', '仍', '不肯', '不成', '月', '接', '母亲', '玩', '亦', '难', '诗', '脸', '未', '好些', '一处', '上来', '待', '忘', '明日', '提', '姐儿', '开', '衣裳', '散', '看着', '偏', '放心', '跟着', '敢', '吗', '不见', '花', '想着', '不觉', '素日', '应', '毕', '疼', '茗', '多少', '娘', '坐下', '红', '依', '想起', '门', '书', '外', '香', '正说', '点头', '真']

# Per-chapter word counting: read the novel line by line, split it into
# chapters at each heading, count the tracked words of every chapter and store
# the counts in one spreadsheet column per chapter.

# Removes digits, whitespace and punctuation from a line; compiled once
# instead of once per line.
# NOTE(review): inside the first character class `:-【` is parsed as a range
# from ':' (U+003A) to '【' (U+3010), so ASCII letters and many other symbols
# are removed as well. The pattern is deliberately left unchanged because
# altering it would change the text handed to the segmenter.
_noise_pattern = re.compile(r"[0-9\s+\.\!\/_,$%^*()?：;；:-【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+")

# Set copy of the vocabulary for O(1) membership tests in the inner loop.
_tracked_words = set(frequently_used_words)

# The workbook and sheet stay at module level: init_excel() and save_date()
# write through the global name `sheet`.
wbk = xlwt.Workbook()
# cell_overwrite_ok=True allows the zero-filled cells to be written again
# with the real counts.
sheet = wbk.add_sheet('Sheet1', cell_overwrite_ok=True)

init_excel()

one_chapter = []  # tracked words seen since the last chapter heading
index = 0         # zero-based position of the chapter being collected

# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the encoding of the text file and pass it explicitly.
with open('file/红楼梦.txt','r') as f:
    for raw_line in f:
        raw_line = raw_line.strip()
        line = _noise_pattern.sub("", raw_line)
        if check_is_title(line):
            # A heading closes the chapter collected so far (if any).
            if one_chapter:
                c = Counter(one_chapter)
                print(c)
                save_date(list(c.most_common()), index)
                index = index + 1
                one_chapter = []
            print(line)
        elif check_is_sundry(raw_line) or check_is_sundry(line):
            # The raw line is checked too: the cleaning pattern strips ASCII
            # letters, so the 'page' marker can only be seen before cleaning.
            continue
        elif line:
            for word, flag in psg.cut(line):
                if check_word_characteristic(flag) and word in _tracked_words:
                    one_chapter.append(word)

# The last chapter has no following heading, so it is saved here.
c = Counter(one_chapter)
print(c)
save_date(list(c.most_common()), index)

# To regenerate the vocabulary, the earlier (commented-out) approach was to
# append every accepted word of the whole text to one list and take
# [w for w, _ in Counter(all_words).most_common(280)].

wbk.save('红楼梦.xls')

代码的大致思路是：先把全文遍历一遍，抽出最常出现的280个词并记录下来；然后分章遍历，统计这些词在每一回中的出现次数，生成 Excel 表格，作为后面数据分析的原始数据。

然后是数据的分析，这里我们基本上沿用了链接文章里的思路，用 sklearn 里的 SVM 来进行预测，直接上代码：

from sklearn import svm
import numpy
import random

import xlrd

# Train a linear SVM on a sample of chapters and predict, for all 120
# chapters, whether each belongs to the first 80 or the last 40.

# Load the per-chapter word-count table: column 0 holds the words, row 0 the
# chapter numbers, and column k (1..120) the counts of chapter k.
workbook = xlrd.open_workbook('红楼梦.xls')
sheet = workbook.sheet_by_index(0)

# Sample 15 training chapters from each group. The upper bounds are 81 and
# 121 so that chapters 80 and 120 can be drawn too (range() excludes its end).
indexs_in_front_80 = random.sample(range(1, 81), 15)
indexs_in_end_40 = random.sample(range(81, 121), 15)

date_x = []  # feature vectors of the training chapters
date_y = []  # labels: 0 = first 80 chapters, 1 = last 40 chapters
total = []   # feature vectors of all 120 chapters, in chapter order

# col_values(col, 1) reads from row 1 to the last row. Passing -1 as the end
# row would drop the final word, because the end index is exclusive.
for index in indexs_in_front_80:
    date_x.append(sheet.col_values(index, 1))
    date_y.append(0)
for index in indexs_in_end_40:
    date_x.append(sheet.col_values(index, 1))
    date_y.append(1)

for index in range(120):
    total.append(sheet.col_values(index + 1, 1))

date_numpy_x = numpy.array(date_x)
date_numpy_y = numpy.array(date_y)

clf = svm.LinearSVC()
clf.fit(date_numpy_x,date_numpy_y)

# Predict once and reuse the result for both the printout and the scoring.
predictions = clf.predict(total)
print(predictions)

# NOTE(review): the 30 training chapters are part of `total`, so this error
# rate is not a held-out estimate.
error = 0

for index, result in enumerate(predictions):
    if index < 80 and result == 1:
        error = error + 1
    if index >= 80 and result == 0:
        error = error + 1
# Divide by the number of predicted chapters (120), not a fixed 100.
print('错误率为{}'.format(error / len(total)))

这里的大致流程是：读取上面生成的 Excel 文件，从前80回和后40回中各随机抽取15回做训练，然后对全部120回做预测，让机器去判断每一回属于前80回还是后40回。