import ast
import sys
import time

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)

# Load per-CBG activation info, round the visit counts to integers, drop
# heavily visited outliers (> 200 visits), and sort ascending by visits.
cbg_activate_info_df = pd.read_csv('cbg_activate_info_df.csv')
cbg_activate_info_df['person_visits'] = [
    round(visits) for visits in cbg_activate_info_df['person_visits'].tolist()
]
cbg_activate_info_df = cbg_activate_info_df[cbg_activate_info_df['person_visits'] <= 200]
cbg_activate_info_df = cbg_activate_info_df.sort_values(by='person_visits')
# CBG ids, as strings, in ascending order of person_visits.
cbg_sorted_list = [str(cbg) for cbg in cbg_activate_info_df['census_block_group'].tolist()]
# Load the canonical CBG ordering that defines the rows/columns of the
# visits matrix read later from cbg_cbg_list.txt.
cbg_ordered_and_visits_df = pd.read_csv('cbg_ordered_and_visits_list.csv')
cbg_id_ordered_list = [str(x) for x in cbg_ordered_and_visits_df['cbg_id_ordered'].tolist()]
# Map each CBG id in cbg_sorted_list to its position in the matrix ordering.
# Build the position lookup once (O(n)) instead of re-scanning
# cbg_id_ordered_list for every CBG (the original O(n*m) nested loop).
# Semantics preserved: a duplicated id keeps its LAST position, and ids
# absent from cbg_id_ordered_list simply get no entry.
_position_by_cbg = {cbg: i for i, cbg in enumerate(cbg_id_ordered_list)}
cbg_serial_num_dict = {  # this dict is crucial: it carries the matrix ordering
    cbg: _position_by_cbg[cbg]
    for cbg in cbg_sorted_list
    if cbg in _position_by_cbg
}
# Min-max scale the filtered, sorted visit counts into [0, 1], then assign
# every CBG to one of 10 activation bins.
person_visits_list = cbg_activate_info_df['person_visits'].tolist()
scaler = MinMaxScaler()
scaled_column = scaler.fit_transform([[visits] for visits in person_visits_list])
scaled_data_list = [round(row[0], 2) for row in scaled_column]
# Hand-picked bin edges (roughly the deciles of the scaled distribution).
bins = np.array([0.0, 0.06, 0.1, 0.13, 0.17, 0.21, 0.27, 0.34, 0.44, 0.58])
bin_of_each_cbg = np.digitize(scaled_data_list, bins)
# groups[k] holds the positions (into the sorted CBG list) that fall in bin k+1.
groups = [
    list(np.where(bin_of_each_cbg == bin_id)[0])
    for bin_id in range(1, len(bins) + 1)
]
# Load the CBG-to-CBG visits matrix: each line of the file is a Python
# literal list representing one matrix row.
list_of_lists = []
with open('cbg_cbg_list.txt', 'r') as file:
    for line in file:
        # ast.literal_eval parses only Python literals, unlike eval(),
        # which would execute arbitrary code embedded in the data file.
        list_of_lists.append(ast.literal_eval(line.strip()))
# Build a 10x10 grid: cell (i, j) is the mean visit count between every CBG
# in x-axis bin i and every CBG in y-axis bin j.
all_square_value_list = []
start = time.perf_counter()
# Resolve each group's matrix indices once up front; the original resolved
# them via cbg_serial_num_dict inside the innermost loops, repeating the
# same lookups for every (group1, group2) combination.
group_matrix_indices = [
    [cbg_serial_num_dict[cbg_sorted_list[g]] for g in group] for group in groups
]
for row_indices in group_matrix_indices:        # 1. bins along the x axis
    x_axis_square_value_list = []
    for col_indices in group_matrix_indices:    # 2. bins along the y axis -> 100 cells
        cell_pairs = len(row_indices) * len(col_indices)
        if cell_pairs == 0:
            # An empty bin would otherwise raise ZeroDivisionError; an empty
            # cell contains no visits, so report a mean of 0.0.
            x_axis_square_value_list.append(0.0)
            continue
        # 3./4. sum the matrix entries over every (row, col) pair in the cell.
        total_visits = sum(
            list_of_lists[r][c] for r in row_indices for c in col_indices
        )
        # Mean visits per CBG pair inside this cell, rounded for display.
        x_axis_square_value_list.append(round(total_visits / cell_pairs, 1))
    all_square_value_list.append(x_axis_square_value_list)
end = time.perf_counter()
print('程序运行时间:' + format(end - start))
# Flip the row order so the smallest activation bin ends up at the bottom of
# the figure (seaborn draws row 0 at the top of the heatmap).
all_square_value_list = all_square_value_list[::-1]
sns.heatmap(all_square_value_list, cmap='hot', annot=False)
# Center a tick label on each of the 10 cells along each axis.
tick_positions = np.arange(0.5, 10.5, 1)
plt.xticks(tick_positions, ['0', '0.06', '0.1', '0.13', '0.17', '0.21', '0.27', '0.34', '0.44', '0.58'])
plt.yticks(tick_positions, ['0.58', '0.44', '0.34', '0.27', '0.21', '0.17', '0.13', '0.1', '0.06', '0'])
plt.title('directed')
plt.xlabel('activation_x')
plt.ylabel('activation_y')
plt.show()
核心代码列出如下:主要是sns.heatmap函数的使用,参数annot表示是否把数据显示在热力图的小方块中。
sns.heatmap(all_square_value_list, cmap='hot', annot=False)
plt.xticks(np.arange(0.5, 10.5, 1), ['0', '0.06', '0.1', '0.13', '0.17', '0.21', '0.27', '0.34', '0.44', '0.58'])
plt.yticks(np.arange(0.5, 10.5, 1), ['0.58', '0.44', '0.34', '0.27', '0.21', '0.17', '0.13', '0.1', '0.06', '0'])
plt.title('directed')
plt.xlabel('activation_x')
plt.ylabel('activation_y')
plt.show()
理解热力图的输入:all_square_value_list本质是一个二维数组,热力图就是把这个二维数组的大小进行比较,把数字的大小用颜色的差异表示出来。热力图本质上可以很清楚地看出哪里的数据较大,哪里的数据较小,比较直观,可用于分析正负相关等。
把做好的数据存到一个矩阵中,用这个矩阵直接画热力图即可。重点是矩阵的排序方式:横轴和纵轴一定要按照自己预期的顺序排列,否则图像的含义会与预期不符。
可以根据实际情况把热力图的x轴和y轴标签更换:plt.xticks函数。