八、多元变量及数据分布可视化
1、多元变量及数据分布常用图形
-
少数变量
- 相关性与因果性:二者相似,但并不一定相同(存在相关性不一定意味着存在因果关系)
-
多元变量
- 多个变量之间存在正相关性、负相关性和弱相关性
- 正相关:线条呈现平行
- 负相关:线条一直交叉(顶端与底端相连)
- 弱相关:方向不清晰
-
数据分布
-
通过不同的图表(例如:散点图、柱状图)来观察数据分布
-
箱线图
-
上四分位数与下四分位数之间的范围称为四分位间距
-
上/下边界
$$\text{上/下边界} = \text{上/下四分位数} \pm 1\frac{1}{2} \times \text{四分位间距}$$
即:上边界 = 上四分位数 + 1.5 × 四分位间距,下边界 = 下四分位数 − 1.5 × 四分位间距
-
-
2、多元变量案例
-
导入可视化所需的模块
import pandas as pd
import pyecharts.options as opts
from pyecharts.globals import CurrentConfig, NotebookType, ThemeType

# Select JupyterLab as the notebook type pyecharts renders for.
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
-
平行折线图
from pyecharts.charts import Parallel

# Load the data: keep only the four plotted columns, as one list per CSV row.
aqi_table = pd.read_csv('data/beijing_AQI_2018.csv')
df_final = aqi_table[['AQI', 'AQI_rank', 'PM', 'Quality_grade']].values.tolist()

# The first three axes take their default type; dim is the column position
# inside each row of df_final.
numeric_axes = [
    opts.ParallelAxisOpts(dim=position, name=column)
    for position, column in enumerate(["AQI", "AQI_rank", "PM"])
]
# The fourth axis is categorical, with its category labels listed explicitly.
grade_axis = opts.ParallelAxisOpts(
    dim=3,
    name="Quality_grade",
    type_="category",
    data=["优", "良", "轻度污染", "中度污染", "重度污染", "严重污染"],
)

parallel = Parallel(init_opts=opts.InitOpts(theme=ThemeType.DARK))
parallel.add_schema(numeric_axes + [grade_axis])
# Only the first 50 rows are drawn.
parallel.add("parallel", df_final[:50])
parallel.set_global_opts(title_opts=opts.TitleOpts(title="北京空气质量平行折线图"))

parallel.load_javascript()
parallel.render_notebook()
-
散点矩阵图
import matplotlib.pyplot as plt
import seaborn as sns

# Data preparation: read the iris table from disk.
iris = pd.read_csv('data/iris.csv')

# Draw the pairwise-relationship grid with Seaborn, using the 'species'
# column as the hue variable.
sns.pairplot(data=iris, hue='species')
plt.show()
3、数据分布案例
- 直方图
- 箱线图
- 多个时间序列图
4、分类数据可视化实验
-
实验环境
- python=3.7.6
- pyecharts=1.7.1
- jupyterlab=1.2.6
-
2018北京AQI全年走势图
from pyecharts.charts import Line

# `df` and `attr` keep their names because later cells read them.
df = pd.read_csv('data/beijing_AQI_2018.csv')
attr = df['Date'].values.tolist()
v1 = df['AQI'].values.tolist()

# Overlays: a mark line at the series average, mark points at its max and min.
average_line = opts.MarkLineOpts(data=[opts.MarkLineItem(type_='average')])
extreme_points = opts.MarkPointOpts(
    data=[opts.MarkPointItem(type_=kind) for kind in ('max', 'min')]
)

line = Line(init_opts=opts.InitOpts(theme=ThemeType.DARK))
line.add_xaxis(attr)
line.add_yaxis(
    "AQI值",
    v1,
    markline_opts=average_line,
    markpoint_opts=extreme_points,
)
# Per-point value labels are switched off.
line.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
line.set_global_opts(title_opts=opts.TitleOpts(title='2018年北京AQI全年走势图'))
line.render_notebook()
-
2018北京PM2.5全年走势图
# Reuses `df`, `attr` (the date labels) and `Line` from the preceding AQI cell.
v1 = df['PM'].values.tolist()

# Overlays: a mark line at the series average, mark points at its max and min.
average_line = opts.MarkLineOpts(data=[opts.MarkLineItem(type_='average')])
extreme_points = opts.MarkPointOpts(
    data=[opts.MarkPointItem(type_=kind) for kind in ('max', 'min')]
)

line = Line(init_opts=opts.InitOpts(theme=ThemeType.DARK))
line.add_xaxis(attr)
line.add_yaxis(
    "PM2.5值",
    v1,
    markline_opts=average_line,
    markpoint_opts=extreme_points,
)
# Per-point value labels are switched off.
line.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
line.set_global_opts(title_opts=opts.TitleOpts(title='2018年北京PM2.5全年走势图'))
line.render_notebook()
-
2018年北京月均AQI走势图
import numpy as np  # stays in scope for the rest of the notebook session

# Tag every row with its month as an INTEGER. Dates are split on '/', and the
# second field is the month. Keeping the month as a string would make groupby
# order the keys lexicographically ('1', '10', '11', '12', '2', ...), so the
# means would be plotted under the wrong month labels.
df['month'] = [int(date.split('/')[1]) for date in df['Date']]

# groupby sorts its keys, so the integer months come out in calendar order.
monthly_mean = df.groupby('month')['AQI'].mean()

# Labels are derived from the months actually present, so every label is
# guaranteed to sit on its own value.
attr = ['{}月'.format(month) for month in monthly_mean.index]
# Means are truncated to whole numbers for display.
v1 = [int(mean) for mean in monthly_mean]

line = (
    Line(init_opts=opts.InitOpts(theme=ThemeType.DARK))
    .add_xaxis(attr)
    .add_yaxis(
        "AQI月均值",
        v1,
        markpoint_opts=opts.MarkPointOpts(
            data=[opts.MarkPointItem(type_='max'), opts.MarkPointItem(type_='min')]
        ),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(title_opts=opts.TitleOpts(title='2018年北京月均AQI走势图'))
)
line.render_notebook()
-
2018年北京月均PM2.5走势图
# Tag every row with its month as an INTEGER. Dates are split on '/', and the
# second field is the month. Keeping the month as a string would make groupby
# order the keys lexicographically ('1', '10', '11', '12', '2', ...), so the
# means would be plotted under the wrong month labels.
df['month'] = [int(date.split('/')[1]) for date in df['Date']]

# groupby sorts its keys, so the integer months come out in calendar order.
monthly_mean = df.groupby('month')['PM'].mean()

# Labels are derived from the months actually present, so every label is
# guaranteed to sit on its own value.
attr = ['{}月'.format(month) for month in monthly_mean.index]
# Means are truncated to whole numbers for display.
v1 = [int(mean) for mean in monthly_mean]

line = (
    Line(init_opts=opts.InitOpts(theme=ThemeType.DARK))
    .add_xaxis(attr)
    .add_yaxis(
        "PM2.5月均值",
        v1,
        markpoint_opts=opts.MarkPointOpts(
            data=[opts.MarkPointItem(type_='max'), opts.MarkPointItem(type_='min')]
        ),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(title_opts=opts.TitleOpts(title='2018年北京月均PM2.5走势图'))
)
line.render_notebook()
-
2018年北京季度AQI箱型图
from pyecharts.charts import Boxplot

dom = df[['Date', 'AQI']]

# One bucket of AQI readings per quarter: data[0] is Q1 ... data[3] is Q4.
data = [[] for _ in range(4)]
for date, aqi in zip(dom['Date'], dom['AQI']):
    # Parse the month numerically so that '1' and '01' are treated alike;
    # comparing month strings would silently push zero-padded months into Q4.
    month = int(date.split('/')[1])
    data[(month - 1) // 3].append(aqi)
dom1, dom2, dom3, dom4 = data

boxplot = Boxplot(init_opts=opts.InitOpts(theme=ThemeType.DARK))
boxplot = (
    boxplot.add_xaxis(['第一季度', '第二季度', '第三季度', '第四季度'])
    # prepare_data turns each raw bucket into the values the box series needs.
    .add_yaxis("", boxplot.prepare_data([dom1, dom2, dom3, dom4]))
    .set_global_opts(title_opts=opts.TitleOpts(title='2018年北京季度AQI箱型图'))
)
boxplot.render_notebook()
-
2018年北京季度PM2.5箱型图
dom = df[['Date', 'PM']]

# One bucket of PM readings per quarter: data[0] is Q1 ... data[3] is Q4.
data = [[] for _ in range(4)]
for date, pm in zip(dom['Date'], dom['PM']):
    # Parse the month numerically so that '1' and '01' are treated alike;
    # comparing month strings would silently push zero-padded months into Q4.
    month = int(date.split('/')[1])
    data[(month - 1) // 3].append(pm)
dom1, dom2, dom3, dom4 = data

boxplot = Boxplot(init_opts=opts.InitOpts(theme=ThemeType.DARK))
boxplot = (
    boxplot.add_xaxis(['第一季度', '第二季度', '第三季度', '第四季度'])
    # prepare_data turns each raw bucket into the values the box series needs.
    .add_yaxis("", boxplot.prepare_data([dom1, dom2, dom3, dom4]))
    .set_global_opts(title_opts=opts.TitleOpts(title='2018年北京季度PM2.5箱型图'))
)
boxplot.render_notebook()
-
2018年北京全年空气质量情况
from pyecharts.charts import Pie

# Count the rows per air-quality grade, most frequent grade first.
rank_com_last = (
    df.groupby(['Quality_grade'])['Quality_grade']
    .agg(['count'])
    .reset_index()
    .sort_values('count', ascending=False)
)
attr = rank_com_last['Quality_grade']
v1 = rank_com_last['count']

# [grade, count] pairs, one per pie slice.
slices = [[grade, count] for grade, count in zip(attr, v1)]

tooltip = opts.TooltipOpts(
    textstyle_opts=opts.TextStyleOpts(align='center'),
    formatter='{a}<br/>{b}: {c} ({d}%)',
)

pie = Pie(init_opts=opts.InitOpts(theme=ThemeType.DARK))
# Two radii (inner, outer) are given, so the pie is drawn as a ring.
pie.add("空气质量", slices, radius=[130, 180], tooltip_opts=tooltip)
pie.set_global_opts(
    title_opts=opts.TitleOpts(title='2018年北京全年空气质量情况', pos_left='center'),
    legend_opts=opts.LegendOpts(orient='vertical', pos_top='5%', pos_left='2%'),
)
pie.render_notebook()
-
2018年北京PM2.5指数日历图
import datetime
import random

from pyecharts.charts import Calendar

dom = df[['Date', 'PM']]


def _iso_day(raw_date):
    """Turn a 'year/month/day' string into the ISO text of that date."""
    fields = raw_date.split('/')
    return datetime.date(int(fields[0]), int(fields[1]), int(fields[2])).isoformat()


# One [ISO date, PM value] pair per row.
list1 = [[_iso_day(raw), int(pm)] for raw, pm in zip(dom['Date'], dom['PM'])]

# The colour scale spans the observed PM range. The built-in max/min are used
# on purpose so the bounds are plain Python numbers.
pm_scale = opts.VisualMapOpts(
    max_=max(dom['PM']),
    min_=min(dom['PM']),
    orient="horizontal",
    is_piecewise=True,
    pos_top="230px",
    pos_left="100px",
)

calendar = Calendar(init_opts=opts.InitOpts(bg_color='white', height='300px'))
calendar.add("PM2.5", list1, calendar_opts=opts.CalendarOpts(range_="2018"))
calendar.set_global_opts(
    title_opts=opts.TitleOpts(title="2018年北京PM2.5指数日历图"),
    visualmap_opts=pm_scale,
)
calendar.render_notebook()
-
2018年北上广深AQI全年走势图
city_name = ['beijing', 'shanghai', 'guangzhou', 'shenzhen']
cityes_AQI = []
for city in city_name:
    aqi_data = pd.read_csv('data/' + city + '_AQI' + '_2018.csv')

    # Month of every row, as an INTEGER. With string months groupby would
    # order the keys '1', '10', '11', '12', '2', ... and the means would be
    # plotted under the wrong month labels.
    aqi_data['Month'] = [int(date.split('/')[1]) for date in aqi_data['Date']]

    # Mean AQI per month; groupby sorts its keys, so this is calendar order.
    month_AQI = aqi_data.groupby('Month')['AQI'].mean()

    # Store the city's monthly means truncated to whole numbers.
    cityes_AQI.append([int(mean) for mean in month_AQI])

# NOTE(review): the fixed 1..12 labels assume every file covers all twelve
# months; a missing month would shift that city's values — confirm the data.
months = ['{}月'.format(month) for month in range(1, 13)]

line = (
    Line(init_opts=opts.InitOpts(theme=ThemeType.DARK))
    .add_xaxis(months)
    .add_yaxis("北京", cityes_AQI[0])
    .add_yaxis("上海", cityes_AQI[1])
    .add_yaxis("广州", cityes_AQI[2])
    .add_yaxis("深圳", cityes_AQI[3])
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title='2018年北上广深AQI全年走势图'),
        legend_opts=opts.LegendOpts(pos_top='8%'),
    )
)
line.render_notebook()
-
2018年北上广深PM2.5全年走势图
cityes_PM = []
for city in city_name:
    pm_data = pd.read_csv('data/' + city + '_AQI' + '_2018.csv')

    # Month of every row, as an INTEGER. With string months groupby would
    # order the keys '1', '10', '11', '12', '2', ... and the means would be
    # plotted under the wrong month labels.
    pm_data['Month'] = [int(date.split('/')[1]) for date in pm_data['Date']]

    # Mean PM per month; groupby sorts its keys, so this is calendar order.
    month_PM = pm_data.groupby('Month')['PM'].mean()

    # Store the city's monthly means truncated to whole numbers.
    cityes_PM.append([int(mean) for mean in month_PM])

# NOTE(review): the fixed 1..12 labels assume every file covers all twelve
# months; a missing month would shift that city's values — confirm the data.
months = ['{}月'.format(month) for month in range(1, 13)]

line = (
    Line(init_opts=opts.InitOpts(theme=ThemeType.DARK))
    .add_xaxis(months)
    .add_yaxis("北京", cityes_PM[0])
    .add_yaxis("上海", cityes_PM[1])
    .add_yaxis("广州", cityes_PM[2])
    .add_yaxis("深圳", cityes_PM[3])
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title='2018年北上广深PM2.5全年走势图'),
        legend_opts=opts.LegendOpts(pos_top='8%'),
    )
)
line.render_notebook()
-
2018年北上广深全年空气质量情况
v = []
attrs = []
for city in city_name:
    # A loop-local name is used for the frame so the notebook-wide `df` is not
    # overwritten by the last city read here.
    city_df = pd.read_csv('data/' + city + '_AQI' + '_2018.csv')

    # Rows per air-quality grade, most frequent grade first.
    grade_counts = (
        city_df.groupby(['Quality_grade'])['Quality_grade']
        .agg(['count'])
        .reset_index()
        .sort_values('count', ascending=False)
    )
    attrs.append(np.array(grade_counts['Quality_grade']))
    v.append(np.array(grade_counts['count']))

# Series label and ring centre for each city, in the same order as city_name.
ring_layout = [
    ("北京", ['20%', '30%']),
    ("上海", ['55%', '30%']),
    ("广州", ['20%', '70%']),
    ("深圳", ['55%', '70%']),
]

pie = Pie(init_opts=opts.InitOpts(theme=ThemeType.DARK))
for (label, center), grades, counts in zip(ring_layout, attrs, v):
    pie.add(
        label,
        # .tolist() converts the numpy values to plain Python objects.
        [list(pair) for pair in zip(grades.tolist(), counts.tolist())],
        radius=[60, 80],
        center=center,
        # The label formatter is the city name itself, positioned at the
        # centre of its ring.
        label_opts=opts.LabelOpts(formatter=label, position="center", font_size='25'),
    )
pie.set_global_opts(
    title_opts=opts.TitleOpts(title='2018年北上广深全年空气质量情况'),
    legend_opts=opts.LegendOpts(
        type_="scroll", pos_top="20%", pos_left="80%", orient="vertical"
    ),
)
pie.render_notebook()