右边为原始的散点图,左边为绘制的热力图。
使用geopands绘制热力图
1、核心代码
import numpy as np
from scipy import ndimage
import matplotlib.pyplot as plt
import matplotlib.image as image
ax1 = plt.subplot(121)
# 边框绘制
boundary_ax = gdf_counter.plot(color='white', edgecolor='black',figsize=(5,5), alpha=0.2, ax = ax1)
boundary_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
boundary_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
boundary_ax.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
bins = (500)
smoothing=5.5
cmap='jet'
# cmap='Blues'
df_test_day = df_index_date.loc["2001-01-01", :]
geom = [shapely.geometry.Point(xy) for xy in zip(df_test_day.Longitude, df_test_day.Latitude)] #先经度,后纬度
gdf_point = geopandas.GeoDataFrame(df_test_day, geometry=geom)
print("gdf_point.shape: ", gdf_point.shape)
lat = gdf_point.loc[:, "Latitude"]
lon = gdf_point.loc[:, "Longitude"]
heatmap, _0, _1 = np.histogram2d(lat, lon, bins=bins, range=[[boundary_ax.get_ybound()[0],boundary_ax.get_ybound()[1]], [boundary_ax.get_xbound()[0], boundary_ax.get_xbound()[1]]])
extent=[boundary_ax.get_xbound()[0], boundary_ax.get_xbound()[1], boundary_ax.get_ybound()[1], boundary_ax.get_ybound()[0]] # (left, right, bottom, top) in data coordinates,计算点的经纬度直方图,差不多就是画表格,然后根据点的多少,确定每个格子里有多少个点。
logheatmap = np.log(heatmap)
logheatmap[np.isneginf(logheatmap)] = 0 # 测试元素的负无穷大,返回结果作为布尔数组
logheatmap = ndimage.filters.gaussian_filter(logheatmap, smoothing, mode='nearest') # 高斯平滑,然有数周围的点,变得比较接近
heat_ax=plt.imshow(logheatmap, cmap=cmap, extent=extent)
heat_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
heat_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
heat_ax.axes.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
heat_ax = plt.gca().invert_yaxis()
# plt.gca().invert_xaxis()
# 第二个图
ax2 = plt.subplot(122)
boundary_ax = gdf_counter.plot(color='white', edgecolor='black',figsize=(5,5), alpha=0.5, ax=ax2)
boundary_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
boundary_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
boundary_ax.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
bp_ax = gdf_point.plot(ax=boundary_ax, alpha=0.15, color='red')
2、项目工程
2.0 mytool库的手动构建
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
def get_all_csv_path(dirname):
'''
获取dirname目录下的所有文件
'''
file_path_list = []
file_names = os.listdir(dirname) #获取该目录下的所有文件
file_names.sort(key = lambda x : int(x[-6:-4])) #例:Chicago_Crimes_2001_to_2004.csv
for file_name in file_names:
file_path_list.append(os.path.join(dirname, file_name))
return file_path_list
def from_list_read_csv(file_path_list):
'''
读取file_path_list下所有的csv文件
注意,因为这些个csv文件中的格式不太一样,就是第一个2001到2014的只有22列,其他的有23列
!!!!!!!!!!!所以在处理数据的时候需要自己检查一下自己的数据格式!!!!!!!!!!!!!
'''
df_all = pd.read_csv(file_path_list[0], delimiter=',', error_bad_lines=False, parse_dates=['Date']) # error_bad_lines=False跳过报错行,parse_dates=['Date']将Date这一列当作date数据格式读取
print("df_all.shape:", df_all.shape)
for file_path in file_path_list[1:]:
print("read_file_path: ", file_path)
df = pd.read_csv(file_path, delimiter=',', error_bad_lines=False, parse_dates=['Date'])
print("df.shape: ", df.shape)
df_all = pd.concat([df_all, df.iloc[:,1:]], ignore_index = True) # ignore_index = false 指的是重新编码最前面的那个序号,!!!!!!后面3个有22列,所以要忽略掉第一列!!!!!!
print("df_all.shape:", df_all.shape)
return df_all
def read_csv(dirname, drop_NAN = False):
# 获取目标目录下的所有文件路径
file_path_list = get_all_csv_path(dirname)
# 读取所有文件,为dataframe格式
df_all = from_list_read_csv(file_path_list)
# 是否删除不全的记录(即出现NAN的记录行)
if drop_NAN :
df_all = df_all.dropna(axis=0, how='any')
df_all = df_all.reset_index(drop=True) #重新编号
return df_all
2.1数据预处理
# 先预处理一下数据
import mytools
import pandas as pd
import torch
df_all = mytools.read_csv("./data/origin", drop_NAN=True)
torch.save(df_all, "./data/process/data_2001_to_2017.pt")
我的数据文件夹组织如下:
2.2 实用ipynb格式,分步骤详细解释
# 导入各种包
import mytools
import pandas as pd
import torch
import numpy as np
pd.options.display.width = 800
pd.set_option('display.max_columns', 11) # 设置显示最大列
pd.set_option('display.max_row', 15) # 设置显示最大行
#读取geojson数据,我这里是芝加哥2001到2017年的犯罪数据josn,自己预处理了一下
df_all = torch.load("./data/process/data_2001_to_2017.pt")
# 我这里只研究日期与坐标的时空关系,所以提取相应的列
df_date_position = df_all.loc[:, ["Date", "Latitude", "Longitude"]]
# 将日期设置为索引
df_date_position["One_Date"] = df_date_position['Date'].dt.to_period('D')
df_index_date = df_date_position.iloc[:, 1:]
df_index_date.set_index("One_Date", inplace = True)
# 选取其中的一天来进行研究
import geopandas
import shapely
import plotly.express as px
gdf_counter = geopandas.read_file('./data/geojson/chicago.geojson')#geojson路径
print(gdf_counter.shape)
# 将pandas数据转化成geopandas
df_test_day = df_index_date.loc["2001-01-01", :]
geom = [shapely.geometry.Point(xy) for xy in zip(df_test_day.Longitude, df_test_day.Latitude)] #先经度,后纬度
gdf_point = geopandas.GeoDataFrame(df_test_day, geometry=geom)
print(gdf_point.shape)
# 这里要获取一下所有的数据,有点慢,为的就是找到所有数据中的极值,方便画图
gdf_all_point = geopandas.GeoDataFrame(df_index_date, geometry=[shapely.geometry.Point(xy) for xy in zip(df_index_date.Longitude, df_index_date.Latitude)])
# 先画一个简单的散点图
boundary_ax = gdf_counter.plot(color='white', edgecolor='black',figsize=(5,5), alpha=0.5)
boundary_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
boundary_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
boundary_ax.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
bp_ax = gdf_point.plot(ax=boundary_ax)
print(bp_ax.get_ybound())
print(bp_ax.get_xbound())
print(boundary_ax.get_ybound())
print(boundary_ax.get_xbound())
print(gdf_point.describe())
# 热力图的绘制
import numpy as np
from scipy import ndimage
import matplotlib.pyplot as plt
import matplotlib.image as image
import matplotlib
nor = matplotlib.colors.Normalize(vmin=0, vmax=0.015, clip=True) #需要引入归一化,不然,没啥犯罪的日子都不显示
plt.figure(dpi=300,figsize=(10,8))
ax1 = plt.subplot(121)
# 边框
boundary_ax = gdf_counter.plot(color='white', edgecolor='black',figsize=(5,5), alpha=0.1, ax = ax1)
boundary_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
boundary_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
boundary_ax.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
bins = (500)
smoothing=5.5
cmap='jet'
# cmap='Blues'
# cmap='Reds'
df_test_day = df_index_date.loc["2001-01-01", :]
geom = [shapely.geometry.Point(xy) for xy in zip(df_test_day.Longitude, df_test_day.Latitude)] #先经度,后纬度
gdf_point = geopandas.GeoDataFrame(df_test_day, geometry=geom)
print("gdf_point.shape: ", gdf_point.shape)
lat = gdf_point.loc[:, "Latitude"]
lon = gdf_point.loc[:, "Longitude"]
heatmap, _0, _1 = np.histogram2d(lat, lon, bins=bins, range=[[boundary_ax.get_ybound()[0],boundary_ax.get_ybound()[1]], [boundary_ax.get_xbound()[0], boundary_ax.get_xbound()[1]]])
extent=[boundary_ax.get_xbound()[0], boundary_ax.get_xbound()[1], boundary_ax.get_ybound()[1], boundary_ax.get_ybound()[0]] # (left, right, bottom, top) in data coordinates
logheatmap = np.log(heatmap+1) #!!!!!加个1是为了让,一个元素的直方图,在取log后值不为0
logheatmap[np.isneginf(logheatmap)] = 0 # 测试元素的负无穷大,返回结果作为布尔数组
# print(logheatmap)
logheatmap = ndimage.filters.gaussian_filter(logheatmap, smoothing, mode='nearest') # 高斯平滑
heat_ax=plt.imshow(logheatmap, cmap=cmap, extent=extent, norm=nor)
heat_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
heat_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
heat_ax.axes.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
heat_ax = plt.gca().invert_yaxis()
# plt.gca().invert_xaxis()
# 第二个图
ax2 = plt.subplot(122)
boundary_ax = gdf_counter.plot(color='white', edgecolor='black',figsize=(5,5), alpha=0.5, ax=ax2)
boundary_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
boundary_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
boundary_ax.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
bp_ax = gdf_point.plot(ax=boundary_ax, alpha=0.15, color='red')
热力图与散点图的比较:
'''
LinearSegmentedColormap函数自定义颜色的变化,使得它符合自己的审美
'''
from pylab import *
from matplotlib.colors import ListedColormap,LinearSegmentedColormap
clist=['white','blue','yellow','red','darkred']
newcmp = LinearSegmentedColormap.from_list('chaos',clist)
plt.figure(dpi=300,figsize=(10,8))
ax1 = plt.subplot(121)
# 边框
boundary_ax = gdf_counter.plot(color='white', edgecolor='black',figsize=(5,5), alpha=0.1, ax = ax1)
boundary_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
boundary_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
boundary_ax.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
heat_ax = plt.imshow(logheatmap, cmap=newcmp, extent=extent, norm=nor)
heat_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
heat_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
heat_ax.axes.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
heat_ax = plt.gca().invert_yaxis()
# 第二个图
ax2 = plt.subplot(122)
boundary_ax = gdf_counter.plot(color='white', edgecolor='black',figsize=(5,5), alpha=0.5, ax=ax2)
boundary_ax.axes.xaxis.set_ticklabels([]) #取消x轴的刻度值
boundary_ax.axes.yaxis.set_ticklabels([]) #取消轴的刻度值
boundary_ax.tick_params(bottom=False, top=False, left=False, right=False) #边上的隐藏刻度线
bp_ax = gdf_point.plot(ax=boundary_ax, alpha=0.15, color='red')