import pandas as pd
import numpy as np
def gx_matrix(vol_li):
    """Build a token co-occurrence count matrix from rows of tokens.

    Args:
        vol_li: Iterable of rows (for example a ``pd.Series`` of lists).
            Each row is an iterable of hashable tokens; all tokens must be
            mutually sortable because the axis labels are sorted.

    Returns:
        A square ``pd.DataFrame`` whose index and columns are the sorted
        distinct tokens. Cell ``[a, b]`` holds the count accumulated for
        ``b`` in ``a``'s counter; pairs never seen together are 0. An empty
        input yields an empty frame.

    NOTE(review): the counting rule is kept exactly as it was. The first
    row in which a token appears initialises its counter to 1 for every
    member of that row; each later occurrence adds 1 per element of the
    row. When a row contains the same token more than once the result is
    therefore not symmetric -- deduplicate rows upstream if plain
    "number of rows shared" counts are wanted.
    """
    # One counter dict per token, held in an ordinary dict instead of being
    # injected into locals() under synthetic 'dic_<token>' names.
    cooccurrence = {}
    for row in vol_li:
        row = list(row)  # rows are walked several times below
        for token in row:
            counts = cooccurrence.get(token)
            if counts is None:
                # First sighting of this token: every member of the row
                # (the token itself included) starts at 1.
                cooccurrence[token] = dict.fromkeys(row, 1)
                continue
            for other in row:
                counts[other] = counts.get(other, 0) + 1

    # Sorted labels on both axes so rows and columns line up.
    vocabulary = sorted(cooccurrence)
    cells = [
        [cooccurrence[row_token].get(col_token, 0) for col_token in vocabulary]
        for row_token in vocabulary
    ]
    # Built in one shot: no repeated concat, and counts stay integers.
    return pd.DataFrame(cells, index=vocabulary, columns=vocabulary)
if __name__ == '__main__':
    # jieba does the tokenising below but was never imported by this script.
    import jieba

    # Punctuation tokens to discard. Written as an explicit list: splitting a
    # comma-separated string on ',' produced empty strings where the comma
    # itself was meant, so commas were never filtered out.
    stop_words = ['《', '》', '“', '?', '”', '[', ']', '"', ',', '。', ':', '.', '(', ')']
    print(stop_words)

    # Collect every non-punctuation, non-blank token of the title file.
    temp1 = []
    with open('./标题.txt', 'r', encoding='gbk') as f:
        for line in f:
            for word in jieba.lcut(line):
                token = word.strip()
                if token and token not in stop_words:
                    temp1.append(token)

    # The whole corpus is passed to gx_matrix as a single row.
    temp_all = [temp1]
    vol_li = pd.Series(temp_all)
    df_matrix = gx_matrix(vol_li)
    print(df_matrix)
    # Plain relative name: the previous '.\\' prefix is only a directory
    # separator on Windows and became part of the file name elsewhere.
    df_matrix.to_csv('词频共现.csv')
```
## 方法二
```python
import os
import xlrd
import re
from pprint import pprint as pt
from tqdm import tqdm
import numpy as np
import jieba
from jieba.posseg import dt
# Part-of-speech flags a segmented word must carry to be kept as a keyword
# (tested in flag_filter). Presumably jieba's tag set -- ns: place name,
# n: noun, vn: verbal noun, v: verb, a: adjective; confirm against jieba docs.
allow_pos = frozenset(['ns', 'n', 'vn', 'v', 'a'])
def buildmatrix(x, y):
    """Return a list of ``x`` rows, each a fresh list of ``y`` integer zeros.

    Rows are independent objects, so writing to one row never affects
    another.
    """
    return [[0] * y for _ in range(x)]
def dic(keygroup):
    """Number the distinct keywords found in '/'-separated strings.

    Args:
        keygroup: Iterable of strings, each holding zero or more keywords
            separated by '/'.

    Returns:
        A dict mapping 1-based positions to keywords, numbered in order of
        first appearance. Empty fragments are skipped.

    Side effects:
        Prints the raw list of fragments (duplicates and empties included).
    """
    keyfir = '/'.join(keygroup).split('/')
    print(keyfir)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # numbering is reproducible; iterating a set of strings gives an order
    # that can change from one interpreter run to the next.
    unique_keys = dict.fromkeys(key for key in keyfir if key != '')
    return {pos: str(key) for pos, key in enumerate(unique_keys, start=1)}
def showmatrix(matrix):
    """Render a square matrix as tab-separated text, one row per line.

    Args:
        matrix: Indexable of rows. Only the first ``len(matrix)`` cells of
            each row are read, i.e. the matrix is treated as square.

    Returns:
        A string in which the cells of a row are joined by tabs and every
        row, including the last, ends with a newline. An empty matrix gives
        an empty string.

    Side effects:
        Shows a tqdm progress bar over the rows.
    """
    size = len(matrix)
    lines = []
    for i in tqdm(range(0, size)):
        # Collect pieces and join once: repeated `text = text + cell` copies
        # the whole accumulated string on every append.
        cells = [str(matrix[i][j]) for j in range(0, size)]
        lines.append('\t'.join(cells) + '\n')
    return ''.join(lines)
def inimatrix(matrix, dic, length):
    """Write keyword labels into the first row and first column of ``matrix``.

    Args:
        matrix: Mutable 2-D structure with at least ``length`` rows and
            columns; modified in place.
        dic: Mapping from position (1 .. length-1) to the label stored there.
        length: Side length of the labelled area, including the corner cell.

    Returns:
        The same ``matrix`` object, with ``'+'`` in the top-left corner and
        ``dic[i]`` at both ``[0][i]`` and ``[i][0]``.
    """
    matrix[0][0] = '+'
    positions = range(1, length)
    top_row = matrix[0]
    # Header row first, then the header column.
    for pos in positions:
        top_row[pos] = dic[pos]
    for pos in positions:
        matrix[pos][0] = dic[pos]
    return matrix
def countmatirx(matrix, dic, mlength, keylis):
    """Fill the body of a labelled matrix with pairwise co-occurrence counts.

    Args:
        matrix: Square 2-D list whose first row and first column already
            hold the keyword labels; modified in place.
        dic: Unused; kept so existing callers keep working.
        mlength: Side length of the matrix, header row/column included.
        keylis: Re-iterable collection of documents. A keyword counts as
            present in a document when ``keyword in document`` is true
            (element membership for token lists, substring for strings).

    Returns:
        The same ``matrix``. For ``i, j >= 1`` cell ``[i][j]`` is the number
        of documents containing both ``matrix[0][i]`` and ``matrix[j][0]``,
        stored as a string; it is ``'0'`` when the two labels are equal.
    """
    col_keys = [str(matrix[0][i]) for i in range(1, mlength)]
    row_keys = [str(matrix[j][0]) for j in range(1, mlength)]

    # Resolve "which documents contain this keyword" once per keyword rather
    # than once per (column, row) pair; the counts come out the same.
    docs_with = {
        key: {n for n, doc in enumerate(keylis) if key in doc}
        for key in set(col_keys) | set(row_keys)
    }

    for i, col_key in enumerate(col_keys, start=1):
        col_docs = docs_with[col_key]
        for j, row_key in enumerate(row_keys, start=1):
            if col_key == row_key:
                shared = 0  # a keyword is not counted against itself
            else:
                shared = len(col_docs & docs_with[row_key])
            matrix[i][j] = str(shared)
    return matrix
def flag_filter(wp):
    """Return True when a tagged word qualifies as a keyword.

    ``wp`` must expose ``flag`` and ``word`` attributes. The word is kept
    when its flag is one of ``allow_pos`` and its text, stripped of
    surrounding whitespace, has at least two characters.
    """
    if wp.flag not in allow_pos:
        return False
    return len(wp.word.strip()) >= 2
def key_words(sentense):
    """Extract the distinct keywords of a text and number them from 1.

    Args:
        sentense: Text to segment with ``dt.cut``.

    Returns:
        A dict mapping 1-based positions to the words accepted by
        ``flag_filter``, numbered in order of first appearance in the text.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. Going
    # through a set made the numbering -- and with it the row/column order of
    # the resulting matrix -- liable to change between runs.
    unique_words = dict.fromkeys(
        wp.word for wp in dt.cut(sentense) if flag_filter(wp)
    )
    return dict(enumerate(unique_words, start=1))
def data_loader(filepath):
    """Load a UTF-8 text file and prepare the inputs for the matrix.

    Args:
        filepath: Path of a text file; each line is treated as one document.

    Returns:
        A ``(keydic, keylis)`` tuple. ``keylis`` holds one token list per
        line, produced by ``jieba.cut`` on the stripped line. ``keydic`` is
        the numbered keyword dict that ``key_words`` builds from the whole
        text.
    """
    with open(filepath, 'r', encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    keylis = [list(jieba.cut(line)) for line in lines]

    # Join with a newline instead of gluing the lines together: without a
    # delimiter the end of one line and the start of the next could be
    # segmented as a single word that occurs in neither line. Whitespace
    # tokens cannot become keywords, because flag_filter requires at least
    # two characters after stripping.
    keydic = key_words('\n'.join(lines))
    return keydic, keylis
def main():
    """Build the keyword co-occurrence matrix for ``aa.txt`` and save it.

    Loads the per-line token lists and the shared keyword dict, labels a
    square matrix with the keywords, fills in the pairwise counts, pretty
    prints the result and writes it to a CSV file.
    """
    # Adjust the input here: the matrix needs the list of documents and the
    # keyword dict shared by all of them.
    source_path = r'aa.txt'
    keydic, keylis = data_loader(source_path)
    print("keylis:",keylis)
    print("keydic:",keydic)

    # One extra row and column for the keyword labels.
    size = len(keydic) + 1
    labelled = inimatrix(buildmatrix(size, size), keydic, size)
    matrix = countmatirx(labelled, keydic, size, keylis)

    # The rendered text is not used afterwards; the call is kept for its
    # progress output.
    showmatrix(matrix)
    pt(matrix)
    np.savetxt('词频共现矩阵.csv', matrix, delimiter = ',', fmt='%s')
    print("词频共现矩阵已生成!")
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
基于Python词频共现矩阵的计算方法
最新推荐文章于 2024-07-14 22:18:32 发布