python 共现矩阵构建

这是一个死肥宅

已于 2024-07-10 16:18:49 修改

阅读量1.3w

点赞数 21

分类专栏：文本处理文章标签：共词矩阵

于 2019-04-26 19:44:02 首次发布

本文链接：https://blog.csdn.net/qq_28840013/article/details/89575548

版权

文本处理专栏收录该内容

7 篇文章 6 订阅

订阅专栏

1.什么是共现矩阵：
共现矩阵：也称为共词矩阵，能表明两个词之间的关系程度

2.构建过程：
数据准备：
假设有10篇文本，我们将从这10篇文本中，提取每一篇的分词结果，并存入Single_text_list中。再将这10篇文章的分词结果列表合为一个嵌套列表Full_text_list：

Full_text_list=[ [文章1切词结果]，[文章2切词结果] ...]

构建：
1.对每篇文章作词频统计，选出其排名前100的词及词频（或者全部词频统计结果）
2.对词频统计结果求并集，结果存入一个字典中，keys()为词，values()为每个词的词频。再将所有特征词存入Full_Feature_word列表中，其对应的词频存入Full_Feature_weight列表中。
3.建一个二维矩阵Common_matrix其大小为：总特征词词数x总特征词词数（也就是共词矩阵）。其横竖分别对应总特征词中的每个词，例如矩阵第3行第5列的数值即代表，特征词第3个与特征词第5个的关系程度，同时它的值也等于该矩阵第5行第3列的值。（对，没错，它是一个对称矩阵）
4.将共词矩阵对角线上元素赋值为它自身在所有文章出现次数。
5.循环遍历特征词列表，构建全部两个词之间的组合,再遍历每一篇文章的切词结果，如果该两个词在同一篇文章中出现，则该两词的权重+1，再将其存入共词矩阵的对应位置中。例如特征词第6个和特征词第8个，这两个词的权重为3，则将其权重3存入共词矩阵的第6行第8列和第8行第6列中。

3.重要功能:
在构建好共词矩阵后，我们需要能够获取 总特征词，特征词所对应的共词矩阵。有了这些，我们还希望能够获取一个词与所有特征词的关系列表

4.python代码
在这里我构建了一个共词矩阵的类，该类接收 Full_text_list=[ [文章1切词结果]，[文章2切词结果] …]参数（即每篇文章分词而得到的矩阵），该类可以返回特征词列表，共词矩阵，并定义了一个方法：参数为一个词，方法返回该词对应的共词矩阵的那一行

import os
import jieba
import collections
import numpy as np
from collections import Counter

class Coword_matrix(object):
    """Word co-occurrence (co-word) matrix built from tokenised documents.

    The constructor takes a nested list ``[[tokens of doc 1], [tokens of
    doc 2], ...]`` and populates:

    * ``Fulltext_cut_content`` -- the input, stored unchanged.
    * ``Full_Feature_word``    -- list of feature words (matrix axis labels).
    * ``Full_Feature_weight``  -- for each feature word, the sum of its
      counts over the documents in which it made the per-document top list.
    * ``Common_matrix``        -- square, symmetric float array.  Entry
      ``[i][j]`` (``i != j``) is the number of documents containing both
      word ``i`` and word ``j``; entry ``[i][i]`` is ``Full_Feature_weight[i]``.
    """

    def __init__(self, Fulltext_cut_content, top_n=100):
        """Collect the feature words and fill the co-occurrence matrix.

        Args:
            Fulltext_cut_content: nested list of tokens, one inner list per
                document.
            top_n: how many of the most frequent words of each document
                become feature words.  ``None`` keeps every word (slower on
                large vocabularies).  Defaults to 100.
        """
        self.Fulltext_cut_content = Fulltext_cut_content

        # Merge the per-document top-n counts.  The newest document's counter
        # is deliberately the LEFT operand: Counter addition lists the left
        # operand's keys first, and that ordering defines the row/column
        # order of the matrix.
        Full_familiar_Feature = Counter()
        for Single_text_feature_list in Fulltext_cut_content:
            top_counts = Counter(dict(Counter(Single_text_feature_list).most_common(top_n)))
            Full_familiar_Feature = top_counts + Full_familiar_Feature
        self.Full_Feature_word = list(Full_familiar_Feature)
        self.Full_Feature_weight = list(Full_familiar_Feature.values())

        size = len(self.Full_Feature_word)
        index_of = {word: idx for idx, word in enumerate(self.Full_Feature_word)}

        # Zero-initialised so that no cell can ever hold uninitialised memory.
        self.Common_matrix = np.zeros((size, size))

        # For every document, find which feature words it contains and add 1
        # to each pair of them.  The indices come from a set, so they are
        # unique and the fancy-indexed "+=" touches each cell exactly once.
        for Single_Text_Cut in Fulltext_cut_content:
            present = [index_of[word] for word in set(Single_Text_Cut) if word in index_of]
            if len(present) > 1:
                self.Common_matrix[np.ix_(present, present)] += 1

        # The block update above also bumps the diagonal; overwrite it last
        # with each word's accumulated weight.
        if size:
            np.fill_diagonal(self.Common_matrix, self.Full_Feature_weight)

    def get_Full_Feature_word(self):
        """Return the list of feature words (the matrix axis labels)."""
        return self.Full_Feature_word

    def get_Common_matrix(self):
        """Return the co-occurrence matrix as a 2-D numpy array."""
        return self.Common_matrix

    def return_word_row(self, word):
        """Return the matrix row that belongs to ``word``.

        If ``word`` is not a feature word, a message is printed and ``None``
        is returned.
        """
        try:
            row = self.Full_Feature_word.index(word)
        except ValueError:
            print(word + "   该词不在特征词中！")
            return None
        return self.Common_matrix[row]
'''
self.Fulltext_cut_content   初始化参数（切词后的嵌套列表 [[文章1切词结果]，[文章2切词结果]...]）
Full_familiar_Feature = {}  # 储存特征和权重，为dict格式
self.Full_Feature_word  特征词列表
self.Common_matrix 共词矩阵
'''

如何对初始文本进行处理，转化为Full_text_list=[ [文章1切词结果]，[文章2切词结果] …]，其python代码如下：

def delstopwordslist(classsstr, stopwords_path='stop.txt'):
    """Remove stop words and very short tokens from a space-separated string.

    Args:
        classsstr: tokens separated by single spaces (e.g. the output of
            ``' '.join(jieba.cut(text))``).
        stopwords_path: UTF-8 text file with one stop word per line.
            Defaults to ``'stop.txt'`` in the current working directory.

    Returns:
        The kept tokens, each followed by one space (so a non-empty result
        ends with a trailing space).  A token is kept when it has more than
        one character and is not a stop word.
    """
    # "with" closes the file; a set gives constant-time membership tests.
    with open(stopwords_path, encoding='UTF-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}
    kept = [
        word for word in classsstr.split(' ')
        if len(word) > 1 and word not in stopwords
    ]
    return ''.join(word + ' ' for word in kept)

def Read_Full_Cut_List(path="测试文本"):
    """Read every file in ``path`` and return its filtered token list.

    Each regular file is read as UTF-8, its lines are stripped and
    concatenated with space characters removed, the text is segmented with
    ``jieba.cut``, and the tokens are filtered through ``delstopwordslist``.

    Args:
        path: directory holding one text file per document.

    Returns:
        A nested list ``[[tokens of file 1], [tokens of file 2], ...]`` in
        sorted file-name order.
    """
    # Characters deleted from every line: ASCII space, no-break space and
    # ideographic (full-width) space.
    # NOTE(review): the original listing showed two visually identical
    # .replace(' ', '') calls; presumably one targeted a non-ASCII space --
    # confirm against the author's file.
    spaces_to_drop = {ord(' '): None, 0x00A0: None, 0x3000: None}

    Fulltext_cut_content = []
    # os.listdir returns names in arbitrary order; sort so the document
    # order is the same on every run.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue  # sub-directories cannot be opened as text files
        with open(file_path, "r", encoding='UTF-8') as f:
            Single_text_content = ''.join(
                line.strip().translate(spaces_to_drop) for line in f
            )
        text = ' '.join(jieba.cut(Single_text_content))    # word segmentation
        # Drop stop words and single-character tokens, then split to a list.
        Fulltext_cut_content.append(delstopwordslist(text).split())
    return Fulltext_cut_content

测试文本格式如下：
在这里插入图片描述
测试代码：

if __name__=="__main__":
    # Demo: load the sample corpus, build the matrix, and print each result
    # under its banner.
    segmented_docs = Read_Full_Cut_List("测试文本")
    coword = Coword_matrix(segmented_docs)
    # Each section is (banner, zero-argument producer); producers are called
    # only when their section is printed, so output order is banner first.
    sections = (
        ('=========================特征词===========================',
         coword.get_Full_Feature_word),
        ('========================共词矩阵===========================',
         coword.get_Common_matrix),
        ('===============“娱乐”词与所有特征词的关系程度===============',
         lambda: coword.return_word_row("娱乐")),
    )
    for banner, produce in sections:
        print(banner)
        print(produce())

结果：
在这里插入图片描述
以上，如果有什么不清楚的地方，可以在评论区说明，将上面代码组合一下，是可以跑通的，可以试一试。

这是一个死肥宅

关注

21
点赞
踩
107

收藏

觉得还不错? 一键收藏
18
评论
python 共现矩阵构建

1.什么是共词矩阵：共词矩阵：共词矩阵能表明两个词之间的关系程度2.构建过程：数据准备：假设有10篇文本，我们将从这10篇文本中，提取每一篇的分词结果，并存入Single_text_list中。再将由10篇文章的关键词列表合为一个列表Full_text_list，Full_text_list=[ [文章1切词结果]，[文章2切词结果] ...]构建：1.对每篇文章作词频统计，选出其...
复制链接

扫一扫