#coding:utf-8
"""Build a keyword co-occurrence matrix from a space-separated text file.

Every line of the input file is treated as one record whose words are
separated by single spaces.  The script counts how often each word occurs,
keeps the words whose frequency lies inside a given range, counts in how many
records each pair of kept words appears together, and writes the resulting
labelled matrix to a CSV file.
"""
import codecs
import math
from collections import Counter

import numpy as np
import pandas as pd

# Default locations used when the script is run without arguments.
DEFAULT_INPUT_PATH = r'C:/Users/Administrator/Desktop/test920.txt'
DEFAULT_OUTPUT_PATH = 'C:/Users/Administrator/Desktop/a.csv'


def read(path):
    """Return a dict mapping every word in the file to its frequency.

    Args:
        path: Path of a UTF-8 text file; words are separated by single spaces.

    Returns:
        A plain ``dict`` of ``word -> count``.  Keys are in order of first
        appearance, which ``get_set_key`` relies on for tie-breaking.
    """
    counts = Counter()
    # The context manager closes the file even if decoding fails part-way.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Consecutive spaces produce empty strings; they are not words.
            counts.update(w for w in line.strip().split(' ') if w != '')
    return dict(counts)


def get_set_key(word_dic, threshold, threshold2):
    """Select the keywords used as the matrix's row and column labels.

    Args:
        word_dic: Mapping of ``word -> frequency``.
        threshold: Minimum frequency (inclusive) for a word to be kept.
        threshold2: Maximum frequency (inclusive) for a word to be kept.

    Returns:
        The kept words as a list sorted by descending frequency.  Words with
        equal frequency keep their relative order from ``word_dic``.
    """
    selected = {
        word: count
        for word, count in word_dic.items()
        if threshold <= count <= threshold2
    }
    # sorted() is stable, also with reverse=True, so ties keep dict order.
    return sorted(selected, key=selected.get, reverse=True)


def format_data(path1, set_key_list):
    """Reduce every line of the file to the distinct keywords it contains.

    Args:
        path1: Path of a UTF-8 text file; words are separated by single spaces.
        set_key_list: Keywords to keep; all other words are dropped.

    Returns:
        A list with one entry per line of the file.  Each entry is a list of
        the keywords found on that line, without duplicates, in order of first
        appearance on the line.
    """
    keywords = set(set_key_list)  # O(1) membership instead of a list scan
    formated_data = []
    with open(path1, 'r', encoding='utf-8') as data:
        for ech in data:
            kept = [
                word
                for word in ech.strip().split(' ')
                if word != '' and word in keywords
            ]
            # dict.fromkeys removes duplicates while keeping a stable order.
            formated_data.append(list(dict.fromkeys(kept)))
    return formated_data


def build_matirx(set_key_list):
    """Create a zero-filled square matrix with one extra row and column.

    The extra row and column (index 0) are reserved for the keyword labels.
    The misspelled function name is kept so existing callers keep working.

    Args:
        set_key_list: The keywords; only its length is used.

    Returns:
        A list of ``len(set_key_list) + 1`` rows, each a list of that many
        zeros.
    """
    edge = len(set_key_list) + 1
    return [[0] * edge for _ in range(edge)]


def init_matrix(set_key_list, matrix):
    """Write the keywords into the first row and the first column.

    Args:
        set_key_list: Keywords, in the order they should label the matrix.
        matrix: Square matrix from ``build_matirx``.  Its first row is
            modified in place.

    Returns:
        A new matrix (list of lists) whose row 0 and column 0 hold the
        keywords from index 1 onwards.  Cell ``[0][0]`` is left untouched.
    """
    matrix[0][1:] = list(set_key_list)
    # Transposing turns the labelled first row into the labelled first column.
    matrix = [list(column) for column in zip(*matrix)]
    matrix[0][1:] = list(set_key_list)
    return matrix


def count_matrix(matrix, formated_data, word_dic):
    """Fill the labelled matrix with keyword co-occurrence counts.

    Two keywords co-occur once for every entry of ``formated_data`` that
    contains both of them.

    Args:
        matrix: Labelled matrix from ``init_matrix``; modified in place.
        formated_data: One list of keywords per record.
        word_dic: Word frequencies.  Currently unused; kept so the signature
            stays compatible with existing callers.

    Returns:
        The same ``matrix`` object.  ``matrix[col][row]`` holds the count for
        the pair (``matrix[0][row]``, ``matrix[col][0]``) when ``col >= row``
        and the two labels differ.  Every other data cell, including the
        diagonal, is the integer 0.
    """
    keywordlist = matrix[0][1:]

    # Inverted index: keyword -> set of record indices containing it.  Built
    # in one pass over the data instead of one full scan per keyword.
    appeardict = {word: set() for word in keywordlist}
    for index, each_line in enumerate(formated_data):
        for word in each_line:
            records = appeardict.get(word)
            if records is not None:
                records.add(index)

    size = len(matrix)
    for row in range(1, size):
        row_word = matrix[0][row]
        row_records = appeardict[row_word]
        for col in range(1, size):
            col_word = matrix[col][0]
            if col >= row and row_word != col_word:
                matrix[col][row] = len(row_records & appeardict[col_word])
            else:
                # Diagonal and the unfilled half stay 0, always as an int so
                # the matrix has a single numeric type.
                matrix[col][row] = 0
    return matrix


def main(path=DEFAULT_INPUT_PATH, output_path=DEFAULT_OUTPUT_PATH,
         threshold=1, threshold2=1800):
    """Run the whole pipeline and write the matrix to a CSV file.

    Args:
        path: Input text file (UTF-8, space-separated words, one record per
            line).
        output_path: Destination CSV file, written as UTF-8 with a BOM.
        threshold: Minimum word frequency (inclusive) for a keyword.
        threshold2: Maximum word frequency (inclusive) for a keyword.
    """
    word_dic = read(path)
    set_key_list = get_set_key(word_dic, threshold, threshold2)
    formated_data = format_data(path, set_key_list)
    matrix = build_matirx(set_key_list)
    matrix = init_matrix(set_key_list, matrix)
    result_matrix = count_matrix(matrix, formated_data, word_dic)
    df = pd.DataFrame(result_matrix)
    # The labels already live inside the matrix, so pandas must not add its
    # own header row or index column.
    df.to_csv(output_path, header=False, encoding='utf-8_sig', index=False)


if __name__ == '__main__':
    main()