# 基于Python词频共现矩阵的计算方法

## 方法一

```python
import pandas as pd
import numpy as np
def gx_matrix(vol_li):
    """Build a word co-occurrence count matrix from rows of tokens.

    Args:
        vol_li: Iterable of rows (for example a pandas Series or a list).
            Each row is a re-iterable sequence of hashable, mutually
            sortable tokens, typically a list of strings.

    Returns:
        pandas.DataFrame: square integer matrix whose index and columns are
        the sorted distinct tokens. Cell ``[a, b]`` is the count stored for
        token ``b`` in token ``a``'s counter; combinations that were never
        counted are 0. An empty input yields an empty DataFrame.

    Counting rule (kept identical to the previous implementation):
        * the first time a token is met, every distinct token of the current
          row starts at 1 in that token's counter;
        * on every later occurrence of the token, each element of the
          current row (duplicates included) adds 1 to the counter.

    NOTE(review): because of this rule, a row that contains the same token
    more than once produces asymmetric counts (``[a, b] != [b, a]``).
    Deduplicate rows before calling if a symmetric matrix is required.
    """
    # Per-token counters live in an ordinary dict. The earlier version kept
    # them in the mapping returned by locals() and steered control flow with
    # nested bare ``except:`` clauses, which hid unrelated errors.
    counts = {}
    for row in vol_li:
        for token in row:
            if token not in counts:
                counts[token] = dict.fromkeys(row, 1)
                continue
            counter = counts[token]
            for other in row:
                counter[other] = counter.get(other, 0) + 1

    # Every token seen in any row owns a counter, so the counter keys are
    # exactly the vocabulary; sort them for a stable label order.
    all_col = sorted(counts)
    df_final = pd.DataFrame.from_dict(counts, orient='index')
    # Align to the full sorted vocabulary; missing combinations become 0.
    return df_final.reindex(index=all_col, columns=all_col).fillna(0).astype(int)


if __name__ == '__main__':
    # Imported here because the import lines above this snippet only bring in
    # pandas and numpy, while the segmentation below needs jieba.
    import jieba

    # Punctuation tokens to drop after segmentation. Spelled out as a list:
    # the previous comma-separated string split by ',' could not contain the
    # comma itself and produced two empty strings in its place.
    stop_words = ['《', '》', '“', '?', '”', '[', ']', '"', ',', '。', ':', '.', '(', ')']
    print(stop_words)

    temp1 = []
    with open('./标题.txt', 'r', encoding='gbk') as f:
        for line in f:
            for word in jieba.lcut(line):
                token = word.strip()
                # The stop-word test is applied to the raw token, as before;
                # whitespace-only tokens are skipped.
                if token and word not in stop_words:
                    temp1.append(token)

    # NOTE(review): all tokens of the file are passed as ONE row, so the
    # matrix describes the file as a single document — confirm this is
    # intended rather than one row per line.
    temp_all = [temp1]
    vol_li = pd.Series(temp_all)
    df_matrix = gx_matrix(vol_li)
    print(df_matrix)
    # Forward-slash relative path: the former r'.\...' form only resolves to
    # the current directory on Windows.
    df_matrix.to_csv('./词频共现.csv')
```
## 方法二

```python
import os
import xlrd
import re
from pprint import pprint as pt
from tqdm import tqdm
import numpy as np
import jieba
from jieba.posseg import dt

# Part-of-speech flags a token must carry to be accepted as a keyword.
# Presumably jieba's tag set (ns place name, n noun, vn verbal noun, v verb,
# a adjective) — verify against the jieba documentation.
allow_pos = frozenset({'ns', 'n', 'vn', 'v', 'a'})

def buildmatrix(x, y):
    """Return a new x-by-y matrix (list of independent row lists) of zeros."""
    # One fresh list per row so that writing a cell never affects other rows.
    return [[0] * y for _ in range(x)]


def dic(keygroup):
    """Number the distinct keywords found in '/'-separated keyword strings.

    Args:
        keygroup: Iterable of strings, each holding keywords separated by '/'.

    Returns:
        dict: ``{position: keyword}`` with positions starting at 1. Keywords
        are numbered in order of first appearance and empty entries are
        skipped.

    Side effects:
        Prints the full (non-deduplicated) keyword list.
    """
    keyfir = '/'.join(keygroup).split('/')
    print(keyfir)
    # dict.fromkeys removes duplicates while keeping first-seen order. The
    # previous set-based version numbered keywords in set iteration order,
    # which differs from run to run for strings.
    unique_keys = dict.fromkeys(key for key in keyfir if key != '')
    return {pos: str(key) for pos, key in enumerate(unique_keys, start=1)}


def showmatrix(matrix):
    """Render a square matrix as tab-separated text, one line per row.

    Args:
        matrix: Indexable matrix with at least ``len(matrix)`` cells in each
            row; only the first ``len(matrix)`` cells of a row are rendered.

    Returns:
        str: every row as its ``str()``-converted cells joined by tabs and
        terminated by a newline; an empty string for an empty matrix.

    A tqdm progress bar is displayed while the rows are processed.
    """
    size = len(matrix)
    lines = []
    for i in tqdm(range(0, size)):
        # Join once per row instead of growing a single string cell by cell,
        # which was quadratic in the size of the output.
        lines.append('\t'.join(str(matrix[i][j]) for j in range(size)) + '\n')
    return ''.join(lines)


def inimatrix(matrix, dic, length):
    """Write the keyword labels into the first row and first column.

    Args:
        matrix: Matrix (list of lists) to label in place.
        dic: Mapping from position (1 .. length - 1) to keyword.
        length: Matrix side length; positions 1 .. length - 1 are labelled.

    Returns:
        The same ``matrix`` object, with '+' in the top-left corner and
        ``dic[i]`` in cells ``[0][i]`` and ``[i][0]``.
    """
    header_row = matrix[0]
    header_row[0] = '+'
    positions = range(1, length)
    # Column headers first, then row headers.
    for pos in positions:
        header_row[pos] = dic[pos]
    for pos in positions:
        matrix[pos][0] = dic[pos]
    return matrix


def countmatirx(matrix, dic, mlength, keylis):
    """Fill the labelled matrix with co-occurrence counts, in place.

    Args:
        matrix: Matrix whose first row and first column hold the labels.
        dic: Unused; kept for interface compatibility.
        mlength: Matrix side length; cells ``[1..mlength-1][1..mlength-1]``
            are written.
        keylis: Iterable of entries; an entry counts for a pair of labels
            when both labels satisfy ``label in entry``.

    Returns:
        The same ``matrix`` object. Each written cell holds the count as a
        string; a label paired with itself always gets '0'.
    """
    for col in range(1, mlength):
        col_label = str(matrix[0][col])
        for row in range(1, mlength):
            row_label = str(matrix[row][0])
            hits = sum(
                1
                for entry in keylis
                if col_label in entry
                and row_label in entry
                and col_label != row_label
            )
            matrix[col][row] = str(hits)
    return matrix


def flag_filter(wp):
    """Return True when *wp* qualifies as a keyword.

    ``wp`` must expose ``flag`` and ``word`` attributes. It qualifies when
    its flag is in ``allow_pos`` and its word, stripped of surrounding
    whitespace, is at least two characters long.
    """
    if wp.flag not in allow_pos:
        return False
    return len(wp.word.strip()) >= 2

def key_words(sentense):
    """Number the distinct keywords found in *sentense*.

    The text is segmented with ``dt.cut`` and every token accepted by
    ``flag_filter`` is kept once.

    Args:
        sentense: Text to segment.

    Returns:
        dict: ``{position: word}`` with positions starting at 1, numbered in
        order of first appearance in the text.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order. The
    # previous set-based version numbered words in set iteration order,
    # which differs from run to run for strings and made the numbering
    # non-reproducible.
    unique_words = dict.fromkeys(
        wp.word for wp in dt.cut(sentense) if flag_filter(wp)
    )
    return dict(enumerate(unique_words, start=1))

def data_loader(filepath):
    """Read a UTF-8 text file and prepare the inputs for the matrix.

    Args:
        filepath: Path of the text file; each line is treated as one entry.

    Returns:
        tuple: ``(keydic, keylis)`` where ``keydic`` is the result of
        ``key_words`` over the whole text and ``keylis`` holds, for every
        line, the list of tokens produced by ``jieba.cut`` on the stripped
        line.
    """
    keylis = []
    lines = []
    with open(filepath, 'r', encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            keylis.append(list(jieba.cut(line)))
            lines.append(line)
    # Join with a newline rather than nothing: gluing the end of one line to
    # the start of the next lets the segmenter form tokens that span two
    # lines. A bare newline token cannot pass flag_filter, whose length check
    # is applied to the stripped word.
    # NOTE(review): assumes dt.cut never merges text across a newline — confirm.
    keydic = key_words('\n'.join(lines))
    return keydic, keylis

def main(filepath=r'aa.txt', outpath='词频共现矩阵.csv'):
    """Build the keyword co-occurrence matrix of a text file and save it.

    Args:
        filepath: Input text file, one entry per line. Defaults to the
            previously hard-coded 'aa.txt'.
        outpath: Destination of the comma-separated matrix. Defaults to the
            previously hard-coded file name.

    The tokenised lines, the keyword dictionary and the finished matrix are
    printed along the way. The tab-separated text rendering that used to be
    computed here was never used, so it is no longer built.
    """
    keydic, keylis = data_loader(filepath)
    print("keylis:", keylis)
    print("keydic:", keydic)

    # One extra row and column hold the keyword labels.
    length = len(keydic) + 1
    # Create the empty matrix.
    matrix = buildmatrix(length, length)
    # Write the keyword labels into the first row and first column.
    matrix = inimatrix(matrix, keydic, length)
    # Fill in the co-occurrence counts for every pair of keywords.
    matrix = countmatirx(matrix, keydic, length, keylis)
    pt(matrix)
    np.savetxt(outpath, matrix, delimiter=',', fmt='%s')
    print("词频共现矩阵已生成!")


if __name__ == '__main__':
    main()
```

  • 2
    点赞
  • 17
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值