弹幕词频统计、停用词过滤、生成云图、基于TD-TDF方法特征词分析

最新推荐文章于 2023-06-23 08:35:07 发布

Damon Tong

最新推荐文章于 2023-06-23 08:35:07 发布

阅读量767

点赞数

分类专栏：现代程设

本文链接：https://blog.csdn.net/qq_46658082/article/details/126803139

版权

python pandas 开发语言

现代程设专栏收录该内容

13 篇文章 2 订阅

订阅专栏

总代码

测试结果部分

函数解释部分

总代码

from cmath import log
from collections import defaultdict
import operator  #
from tkinter import Image
import jieba  # 分词器
import wordcloud  # 词云库
import pandas as pd  # 数据处理库
import numpy as np  # 数据分析库
import csv
import random
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image


def test():  # 由于数据量过于庞大生成测试文件：100条停用词和10000条弹幕
    op = open("D:\\经管大三\\现代程序设计\\week2\\stopwords_list.txt", "r", encoding="utf-8")
    f = open("D:\\经管大三\\现代程序设计\\week2\\弹幕数据\\danmuku.csv", encoding="utf-8")
    new_s = open("D:\\经管大三\\现代程序设计\\week2\\new_s.txt", "w", encoding="utf-8")
    file = f.readlines()  # 生成一个列表
    stop = op.readlines()
    new_s.writelines(stop[0:100])#截取前一百个停用词
    column = file[0].strip().split(",")#截取第一行作为header
    # print(file[1].strip().split(","))
    row = []  # 字定义数组行
    # 下面这步操作是因为dataframe读取时的数据格式为[[1,2,3],[1,2,3]]
    for i in file[1:10001]:  # 包含了10000行数据
        i = i.strip().split(",")
        if len(i) != 3:
            continue  # 这一步是为了防止在截取时出现错误，否则可能出现列数不匹配的问题
        row.append(i)
    new = pd.DataFrame(row, columns=column)
    new.to_csv("D:\\经管大三\\现代程序设计\\week2\\test_danmu.csv")
    f.close()
    op.close()
    new_s.close()
    path1 = "D:\\经管大三\\现代程序设计\\week2\\new_s.txt"  # 停用词测试组路径
    path2 = "D:\\经管大三\\现代程序设计\\week2\\test_danmu.csv"  # 弹幕组测试路径
    return path1, path2


'''
    可以写为
    f=open(filepath,"r").readlines()读取所有行并且生成一个列表
    stopword=[]
    for line in f:
        stopword+=line.strip()即去除每行首尾的空白字符
'''


def Stopwordlist(path1):  # 参数为停用词的地址
    stopword = [line.strip() for line in open(path1, 'r', encoding="utf-8").readlines()]  # 以行的形式读取停用词表，同时转换为列表
    return stopword


def Read_fen(path1, path2):  # 停用词表需要加入到分词的自定义字典中，防止在给弹幕文档分词时出现错误,path1为停用词的地址，path2为弹幕地址
    jieba.load_userdict(path1)  # 按行自定义分词词典
    document = pd.read_csv(path2, encoding="utf-8")  # 经过读取得到第一列到第三列名称为content\second\sent,此时返回的是一个dataframe
    doc = document["content"]  # 获取一列数据，此时是按行输出
    result = []  # 保存分词结果
    for i in doc:
        doc_1 = jieba.lcut(i, cut_all=False)  # 对每一行进行分词,采用精确分词的形式，此时按照分词词典中的方式来
        result += doc_1  # 将每行的分词结果存入result

    return result


'''
或者采用
document=pd.read_csv(filepath)
doc=documenr["content"]
result=jieba.lcut(document.strip())
'''


def Filter_sat(stopword, result):  # 过滤停用词并且进行词频统计,0为停用词，1为分词结果
    newlist = []
    for word in result:
        if word in stopword:
            continue
        newlist.append(word)
    counts = {}  # 记录词频的字典
    for word in newlist:
        counts[word] = counts.get(word, 0) + 1  # 如果word在counts中，则返回其值+1，否则默认为0+1
    items = list(counts.items())  # 由于字典类型没有顺序，将其转化为有顺序的列表类型
    items.sort(key=lambda x: x[1], reverse=True)  # 按照每个列表元素的第二个数，即value进行排序，且为降序
    for i in range(0, 5, 1):  # 输出5个高频词
        key = items[i][0]
        count = items[i][1]
        print("{0:<10}{1:>7}".format(key, count))
    for i in range(-1, -6, -1):  # 输出5个低频词
        key = items[i][0]
        count = items[i][1]
        print("{0:<10}{1:>7}".format(key, count))
    return newlist, items  # 返回过滤后的词，以及经过排序的列表 newlist为过滤后的列表（未经过统计）,items为经过排序的字典样式列表


def Gaopin(items, high):  # 高频词汇，high为高频词的边界条件,生成特征集
    gaopin = []
    for i in items:
        if i[1] >= high:
            gaopin.append(i[0])
        else:
            break  # 当小于5000就不算高频词,则直接跳出
    return gaopin


def One_hot(gaopin, path2):  # 生成一个弹幕向量集,注意使用path2，表示弹幕路径
    f = open(path2, "r", encoding="utf-8")
    length = len(gaopin)  # 表示生成向量的维数
    file = f.readlines()
    file = [i.strip() for i in file]  # 生成一个没有换行符的字符串列表
    matrix = []  # 用来保存字符串向量的矩阵
    for item in file:
        ver = [0 for i in range(length)]  # 生成一行初始化向量
        for j in range(length):
            ver[j] = item.count(gaopin[j])  # 记录某一个高频词在字符串中出现的次数
        matrix.append(ver)  # 生成矩阵，行数为弹幕数量
    return matrix


def differ(matrix,path2):  # 判断各弹幕之间的差异(随机选取2行进行比较)
    number = random.sample(range(len(matrix)), 2)  # 随机取两行,返回的是两行的标号
    f = open(path2, "r", encoding="utf-8")
    file=f.readlines()
    file = [i.strip() for i in file]
    a = np.array(matrix[number[0]])
    b = np.array(matrix[number[1]])
    dis = np.sqrt(np.sum(np.square(a - b)))
    print("分别抽取了第{0:}条和第{1:}条弹幕向量进行比较：".format(number[0]-1, number[1]-1))
    print(file[number[0]])
    print(file[number[1]])
    print(matrix[number[0]])
    print(matrix[number[1]])
    print("distance={}".format(dis))


def wd(newlist, gaopin):
    new = []
    for i in newlist:  # 只保留高频词的列表
        if i in gaopin:
            new.append(i)
    new = " ".join(new)  # 首先需要把分词列表连接起来，云图制作需要使用字符串，并且以空格隔开方便统计
    img = Image.open("D:\\经管大三\\现代程序设计\\week2\\danmu.jpg")
    im = np.array(img)  # 将图片数组化
    wc = WordCloud(
        font_path="msyh.ttc",  # 如果添加字体的格式，会出现词云显示为方框
        mask=im,
        width=1000,
        height=700,
        background_color='white').generate(new)
    plt.imshow(wc, interpolation='bilinear')  # 用plt显示图片
    plt.show()  # 展示图片
    wc.to_file("D:\\经管大三\\现代程序设计\\week2\\wordcloud2.png")  # 保存云图


# 这部分是采取了字典生成词云的方法
def wd_dic(items):  # items为类似字典的词频列表
    items = dict(items[0:50])  # 提取前50个高频词
    wc = WordCloud(
        font_path="msyh.ttc",
        background_color="white",  # 设置背景为白色，默认为黑色
        width=1500,  # 设置图片的宽度
        height=960,  # 设置图片的高度
        margin=10  # 设置图片的边缘
    )
    wc.generate_from_frequencies(items)  # 从字典生成词云
    plt.imshow(wc)  # 显示词云
    plt.axis('off')  # 关闭坐标轴
    plt.show()  # 显示图像
    wc.to_file("D:\\经管大三\\现代程序设计\\week2\\wordcloud2.png")  # 保存图片


def TD_IDF(items, path2):  # 传入参数为词频统计表
    total = 0
    for i in items:
        total += i[1]  # 计算总词数
    word_tf = {}  # 存储每个词的tf值
    for i in items:
        word_tf[i[0]] = i[1] / total
    f = open(path2, "r", encoding="utf-8")
    file = f.readlines()
    file = [i.strip() for i in file]  # 生成一个没有换行符的字符串列表
    length = len(file)  # 记录总文件数
    word_idf = {}
    word_doc = defaultdict(int)  # 存储包含该词的文档数
    for i in items:  # 统计每个词出现在所有弹幕中的次数
        for j in file:
            if i[0] in j:
                word_doc[i[0]] += 1  # 该方法被用于计数,来源于collection库
    for i in items:
        word_idf[i[0]] = log(length / (word_doc[i[0]] + 1))  # 避免分母为0的情况
    # 计算每个词的TF*IDF的值
    word_tf_idf = {}
    for i in items:
        word_tf_idf[i[0]] = word_tf[i[0]] * word_idf[i[0]].real  # 我们保留实数部分作为比较
    # 对字典按值由大到小排序
    dict_feature_select = sorted(word_tf_idf.items(), key=operator.itemgetter(1), reverse=True)
    return dict_feature_select


def main():
    path1,path2=test()
    #path1 = "D:\\经管大三\\现代程序设计\\week2\\stopwords_list.txt"  # 停用词
    #path2 = "D:\\经管大三\\现代程序设计\\week2\\弹幕数据\\danmuku.csv"  # 弹幕
    result = Read_fen(path1, path2)
    stopword = Stopwordlist(path1)
    newlist, items = Filter_sat(stopword, result)
    gaopin = Gaopin(items, 600)
    print(gaopin)
    matrix = One_hot(gaopin, path2)
    print(matrix)
    differ(matrix,path2)
    #wd(newlist, gaopin)
    wd_dic(items)
    feature = TD_IDF(items, path2)  # 为了避免不必要的消耗，排在1000以后的词认为不需要再做特征词的讨论
    print(feature)


main()

测试结果部分

输出的是前五个高频词和后五个低频次，以及频率超过600的词作为高频词

下面为依据特征词生成的没条弹幕构成的特征词向量矩阵的部分截图

比较特征词向量，distance为欧氏距离

生词词云，根据高频词来生成

以下是TD-ITD的测试部分结果，可以得到每个高频词的重要程度

函数解释部分

调用的库

from cmath import log
from collections import defaultdict
import operator  #
from tkinter import Image
import this as d
import jieba  # 分词器
import wordcloud  # 词云库
import pandas as pd  # 数据处理库
import numpy as np  # 数据分析库
import csv
import random
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image

测试函数

由于题目所给的文件数据量过于庞大，导致每一次测试都要花费大量的时间，因此我将生成两个测试文件，截取部分弹幕和停用词

def test():  # 由于数据量过于庞大生成测试文件：100条停用词和10000条弹幕
    op = open("D:\\经管大三\\现代程序设计\\week2\\stopwords_list.txt", "r", encoding="utf-8")
    f = open("D:\\经管大三\\现代程序设计\\week2\\弹幕数据\\danmuku.csv", encoding="utf-8")
    new_s = open("D:\\经管大三\\现代程序设计\\week2\\new_s.txt", "w", encoding="utf-8")
    file = f.readlines()  # 生成一个列表
    stop = op.readlines()
    new_s.writelines(stop[0:100])#截取前一百个停用词
    column = file[0].strip().split(",")#截取第一行作为header
    # print(file[1].strip().split(","))
    row = []  # 字定义数组行
    # 下面这步操作是因为dataframe读取时的数据格式为[[1,2,3],[1,2,3]]
    for i in file[1:10001]:  # 包含了10000行数据
        i = i.strip().split(",")
        if len(i) != 3:
            continue  # 这一步是为了防止在截取时出现错误，否则可能出现列数不匹配的问题
        row.append(i)
    new = pd.DataFrame(row, columns=column)
    new.to_csv("D:\\经管大三\\现代程序设计\\week2\\test_danmu.csv")
    f.close()
    op.close()
    new_s.close()
    path1 = "D:\\经管大三\\现代程序设计\\week2\\new_s.txt"  # 停用词测试组路径
    path2 = "D:\\经管大三\\现代程序设计\\week2\\test_danmu.csv"  # 弹幕组测试路径
    return path1, path2

生成停用词列表

def Stopwordlist(path1):  # 参数为停用词的地址
    stopword = [line.strip() for line in open(path1, 'r', encoding="utf-8").readlines()]  # 以行的形式读取停用词表，同时转换为列表
    return stopword

将弹幕根据停用词分词并且生成列表

def Read_fen(path1, path2):  # 停用词表需要加入到分词的自定义字典中，防止在给弹幕文档分词时出现错误,path1为停用词的地址，path2为弹幕地址
    jieba.load_userdict(path1)  # 按行自定义分词词典
    document = pd.read_csv(path2, encoding="utf-8")  # 经过读取得到第一列到第三列名称为content\second\sent,此时返回的是一个dataframe
    doc = document["content"]  # 获取一列数据，此时是按行输出
    result = []  # 保存分词结果
    for i in doc:
        doc_1 = jieba.lcut(i, cut_all=False)  # 对每一行进行分词,采用精确分词的形式，此时按照分词词典中的方式来
        result += doc_1  # 将每行的分词结果存入result
    return result

过滤停用词并且统计词频

def Filter_sat(stopword, result):  # 过滤停用词并且进行词频统计,0为停用词，1为分词结果
    newlist = []
    for word in result:
        if word in stopword:
            continue
        newlist.append(word)
    counts = {}  # 记录词频的字典
    for word in newlist:
        counts[word] = counts.get(word, 0) + 1  # 如果word在counts中，则返回其值+1，否则默认为0+1
    items = list(counts.items())  # 由于字典类型没有顺序，将其转化为有顺序的列表类型
    items.sort(key=lambda x: x[1], reverse=True)  # 按照每个列表元素的第二个数，即value进行排序，且为降序
    for i in range(0, 5, 1):  # 输出5个高频词
        key = items[i][0]
        count = items[i][1]
        print("{0:<10}{1:>7}".format(key, count))
    for i in range(-1, -6, -1):  # 输出5个低频词
        key = items[i][0]
        count = items[i][1]
        print("{0:<10}{1:>7}".format(key, count))
    return newlist, items  # 返回过滤后的词，以及经过排序的列表 newlist为过滤后的列表（未经过统计）,items为经过排序的字典样式列表

生成高频词列表

def Gaopin(items, high):  # 高频词汇，high为高频词的边界条件,生成特征集
    gaopin = []
    for i in items:
        if i[1] >= high:
            gaopin.append(i[0])
        else:
            break  # 当小于5000就不算高频词,则直接跳出
    return gaopin

根据高频词生成弹幕向量矩阵

def One_hot(gaopin, path2):  # 生成一个弹幕向量集,注意使用path2，表示弹幕路径
    f = open(path2, "r", encoding="utf-8")
    length = len(gaopin)  # 表示生成向量的维数
    file = f.readlines()
    file = [i.strip() for i in file]  # 生成一个没有换行符的字符串列表
    matrix = []  # 用来保存字符串向量的矩阵
    for item in file:
        ver = [0 for i in range(length)]  # 生成一行初始化向量
        for j in range(length):
            ver[j] = item.count(gaopin[j])  # 记录某一个高频词在字符串中出现的次数
        matrix.append(ver)  # 生成矩阵，行数为弹幕数量
    return matrix

随机比较两条弹幕

def differ(matrix,path2):  # 判断各弹幕之间的差异(随机选取2行进行比较)
    number = random.sample(range(len(matrix)), 2)  # 随机取两行,返回的是两行的标号
    f = open(path2, "r", encoding="utf-8")
    file=f.readlines()
    file = [i.strip() for i in file]
    a = np.array(matrix[number[0]])
    b = np.array(matrix[number[1]])
    dis = np.sqrt(np.sum(np.square(a - b)))
    print("分别抽取了第{0:}条和第{1:}条弹幕向量进行比较：".format(number[0]-1, number[1]-1))
    print(file[number[0]])
    print(file[number[1]])
    print(matrix[number[0]])
    print(matrix[number[1]])
    print("distance={}".format(dis))

两种生成词云的方式：列表、字典

def wd(newlist, gaopin):
    new = []
    for i in newlist:  # 只保留高频词的列表
        if i in gaopin:
            new.append(i)
    new = " ".join(new)  # 首先需要把分词列表连接起来，云图制作需要使用字符串，并且以空格隔开方便统计
    img = Image.open("D:\\经管大三\\现代程序设计\\week2\\danmu.jpg")
    im = np.array(img)  # 将图片数组化
    wc = WordCloud(
        font_path="msyh.ttc",  # 如果添加字体的格式，会出现词云显示为方框
        mask=im,
        width=1000,
        height=700,
        background_color='white').generate(new)
    plt.imshow(wc, interpolation='bilinear')  # 用plt显示图片
    plt.show()  # 展示图片
    wc.to_file("D:\\经管大三\\现代程序设计\\week2\\wordcloud2.png")  # 保存云图


# 这部分是采取了字典生成词云的方法
def wd_dic(items):  # items为类似字典的词频列表
    items = dict(items[0:50])  # 提取前50个高频词
    wc = WordCloud(
        font_path="msyh.ttc",
        background_color="white",  # 设置背景为白色，默认为黑色
        width=1500,  # 设置图片的宽度
        height=960,  # 设置图片的高度
        margin=10  # 设置图片的边缘
    )
    wc.generate_from_frequencies(items)  # 从字典生成词云
    plt.imshow(wc)  # 显示词云
    plt.axis('off')  # 关闭坐标轴
    plt.show()  # 显示图像
    wc.to_file("D:\\经管大三\\现代程序设计\\week2\\wordcloud2.png")  # 保存图片

使用TD-TDF方法做特征词分析

def TD_IDF(items, path2):  # 传入参数为词频统计表
    total = 0
    for i in items:
        total += i[1]  # 计算总词数
    word_tf = {}  # 存储每个词的tf值
    for i in items:
        word_tf[i[0]] = i[1] / total
    f = open(path2, "r", encoding="utf-8")
    file = f.readlines()
    file = [i.strip() for i in file]  # 生成一个没有换行符的字符串列表
    length = len(file)  # 记录总文件数
    word_idf = {}
    word_doc = defaultdict(int)  # 存储包含该词的文档数
    for i in items:  # 统计每个词出现在所有弹幕中的次数
        for j in file:
            if i[0] in j:
                word_doc[i[0]] += 1  # 该方法被用于计数,来源于collection库
    for i in items:
        word_idf[i[0]] = log(length / (word_doc[i[0]] + 1))  # 避免分母为0的情况
    # 计算每个词的TF*IDF的值
    word_tf_idf = {}
    for i in items:
        word_tf_idf[i[0]] = word_tf[i[0]] * word_idf[i[0]].real  # 我们保留实数部分作为比较
    # 对字典按值由大到小排序
    dict_feature_select = sorted(word_tf_idf.items(), key=operator.itemgetter(1), reverse=True)
    return dict_feature_select

主函数调用

def main():
    path1,path2=test()
    #path1 = "D:\\经管大三\\现代程序设计\\week2\\stopwords_list.txt"  # 停用词
    #path2 = "D:\\经管大三\\现代程序设计\\week2\\弹幕数据\\danmuku.csv"  # 弹幕
    result = Read_fen(path1, path2)
    stopword = Stopwordlist(path1)
    newlist, items = Filter_sat(stopword, result)
    gaopin = Gaopin(items, 600)
    print(gaopin)
    matrix = One_hot(gaopin, path2)
    print(matrix)
    differ(matrix,path2)
    #wd(newlist, gaopin)
    wd_dic(items)
    feature = TD_IDF(items, path2)  # 为了避免不必要的消耗，排在1000以后的词认为不需要再做特征词的讨论
    print(feature)

主函数调用改进，当我们采用非测试路径，即上百万条弹幕时，我发现调用该主函数导致了窗口崩溃，因此只打印少部分，且需要修改gaopin函数里的参数

def main():
    #path1,path2=test()
    path1 = "D:\\经管大三\\现代程序设计\\week2\\stopwords_list.txt"  # 停用词
    path2 = "D:\\经管大三\\现代程序设计\\week2\\弹幕数据\\danmuku.csv"  # 弹幕
    result = Read_fen(path1, path2)
    stopword = Stopwordlist(path1)
    newlist, items = Filter_sat(stopword, result)
    gaopin = Gaopin(items, 8000)
    print(gaopin)
    matrix = One_hot(gaopin, path2)
    differ(matrix,path2)
    #wd(newlist, gaopin)
    wd_dic(items)
    feature = TD_IDF(items, path2)  # 为了避免不必要的消耗，排在1000以后的词认为不需要再做特征词的讨论
    print(feature[0:20])