数据科学和人工智能技术笔记二十、数据可视化

最新推荐文章于 2020-09-16 13:27:26 发布

weixin_34218579

最新推荐文章于 2020-09-16 13:27:26 发布

阅读量245

点赞数

文章标签：人工智能 python 开发工具

　　二十、数据可视化
　　
　　作者：Chris Albon
　　
　　译者：飞龙
　　
　　协议：CC BY-NC-SA 4.0
　　
　　MatPlotLib 中的双向条形图
　　
　　%matplotlib inline
　　
　　import pandas as pd
　　
　　import matplotlib.pyplot as plt
　　
　　import numpy as np
　　
　　# 创建数据帧
　　
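　　raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
　　
　　'pre_score': [4, 24, 31, 2, 3],
　　
　　'mid_score': [25, 94, 57, 62, 70],
　　
　　'post_score': [5, 43, 23, 23, 51]}
　　
　　df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])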
　　
　　first_name pre_score mid_score post_score
　　
　　0 Jason 4 25 5
　　
　　1 Molly 24 94 43
　　
　　2 Tina 31 57 23
　　
　　3 Jake 2 62 23
　　
　　4 Amy 3 70 51
　　
　　# 输入数据，特别是第二和
　　
　　# 第三行，跳过第一列
　　
　　x1 = df.iloc[1, 1:]
　　
　　x2 = df.iloc[2, 1:]
　　
　　# 创建条形标签
　　
　　bar_labels = ['Pre Score', 'Mid Score', 'Post Score']
　　
　　# 创建图形
　　
　　fig = plt.figure(figsize=(8,6))
　　
　　# 设置 y 的位置
　　
　　y_pos = np.arange(len(x1))
　　
　　y_pos = [x for x in y_pos]
　　
　　plt.yticks(y_pos, bar_labels, fontsize=10)
　　
　　# 在 y_pos 的位置上创建水平条形
　　
　　plt.barh(y_pos,
　　
　　# 使用数据 x1
　　
　　x1,
　　
　　# 中心对齐
　　
　　align='center',
　　
　　# 透明度为 0.4
　　
　　alpha=0.4,
　　
　　# 颜色为绿色
　　
　　color='#263F13')
　　
　　# 在 y_pos 的位置上创建水平条形
　　
　　plt.barh(y_pos,
　　
　　# 使用数据 -x2
　　
　　-x2,
　　
　　# 中心对齐
　　
　　align='center',
　　
　　# 透明度为 0.4
　　
　　alpha=0.4,
　　
　　# 颜色为绿色
　　
　　color='#77A61D')
　　
　　# 注解和标签
　　
　　plt.xlabel('Tina\'s Score: Light Green. Molly\'s Score: Dark Green')
　　
　　t = plt.title('Comparison of Molly and Tina\'s Score')
　　
　　plt.ylim([-1,len(x1)+0.1])
　　
　　plt.xlim([-max(x2)-10, max(x1)+10])
　　
　　first_name pre_score mid_score post_score
　　
　　0 Jason 4 25 5
　　
　　1 Molly 24 94 43
　　
　　2 Tina 31 57 23
　　
　　3 Jake 2 62 23
　　
　　4 Amy 3 70 51
　　
　　# 为每个变量创建得分均值的列表
　　
　　mean_values = [df['pre_score'].mean(), df['mid_score'].mean(), df['post_score'].mean()]
　　
　　# 创建变动列表，设为得分上下 .25
　　
　　variance = [df['pre_score'].mean() * 0.25, df['mid_score'].mean() * 0.25, df['post_score'].mean() * 0.25]
　　
　　# 设置条形标签
　　
　　bar_labels = ['Pre Score', 'Mid Score', 'Post Score']
　　
　　# 创建条形的 x 位置
　　
　　x_pos = list(range(len(bar_labels)))
　　
　　# 在 x 位置上创建条形图
　　
　　plt.bar(x_pos,
　　
　　# 使用 mean_values 中的数据
　　
　　mean_values,
　　
　　# y-error 直线设置为变动
　　
　　yerr=variance,
　　
　　# 中心对齐
　　
　　align='center',
　　
　　# 颜色
　　
　　color='#FFC222',
　　
　　# 透明度为 0.5
　　
　　alpha=0.5)
　　
　　Seaborn 中的调色板
　　
　　import pandas as pd
　　
　　%matplotlib inline
　　
　　import matplotlib.pyplot as plt
　　
　　import seaborn as sns
　　
　　# 创建数据帧
　　
　　data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
　　
　　'deaths_regiment_1': [34, 43, 14, 15, 15, 14, 31, 25, 62, 41],
　　
　　'deaths_regiment_2': [52, 66, 78, 15, 15, 5, 25, 25, 86, 1],
　　
　　'deaths_regiment_3': [13, 73, 82, 58, 52, 87, 26, 5, 56, 75],
　　
　　'deaths_regiment_4': [44, 75, 26, 15, 15, 14, 54, 25, 24, 72],
　　
　　'deaths_regiment_5': [25, 24, 25, 15, 57, 68, 21, 27, 62, 5],
　　
　　'deaths_regiment_6': [84, 84, 26, 15, 15, 14, 26, 25, 62, 24],
　　
　　'deaths_regiment_7': [46, 57, 26, 15, 15, 14, 26, 25, 62, 41]}
　　
　　df = pd.DataFrame(data, columns = ['date', 'battle_deaths', 'deaths_regiment_1', 'deaths_regiment_2',
　　
　　'deaths_regiment_3', 'deaths_regiment_4', 'deaths_regiment_5',
　　
　　'deaths_regiment_6', 'deaths_regiment_7'])
　　
　　df = df.set_index(df.date)
　　
　　sns.palplot(sns.color_palette("muted", 10))
　　
　　1
　　
　　png
　　
　　sns.palplot(sns.color_palette("bright", 10))
　　
　　1
　　
　　png
　　
　　从 Pandas 数据帧生成 MatPlotLib 散点图
　　
　　%matplotlib inline
　　
　　import pandas as pd
　　
　　import matplotlib.pyplot as plt
　　
　　import numpy as np
　　
　　raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
　　
　　'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
　　
　　'female': [0, 1, 1, 0, 1],
　　
　　'age': [42, 52, 36, 24, 73],
　　
　　'preTestScore': [4, 24, 31, 2, 3],
　　
　　'postTestScore': [25, 94, 57, 62, 70]}
　　
　　df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'female', 'preTestScore', 'postTestScore'])
　　
　　first_name last_name age female preTestScore postTestScore
　　
　　0 Jason Miller 42 0 4 25
　　
　　1 Molly Jacobson 52 1 24 94
　　
　　2 Tina Ali 36 1 31 57
　　
　　3 Jake Milner 24 0 2 62
　　
　　4 Amy Cooze 73 1 3 70
　　
　　# preTestScore 和 postTestScore 的散点图
　　
　　# 每个点的大小取决于年龄
　　
　　plt.scatter(df.preTestScore, df.postTestScore
　　
　　, s=df.age)
　　
　　# <matplotlib.collections.PathCollection at 0x10ca42b00>
　　
　　# preTestScore 和 postTestScore 的散点图
　　
　　# 大小为 300，颜色取决于性别
　　
　　plt.scatter(df.preTestScore, df.postTestScore, s=300, c=df.female)
　　
　　# <matplotlib.collections.PathCollection at 0x10cb90a90>
　　
　　Matplotlib 的简单示例
　　
　　# 让 Jupyter 加载 matplotlib
　　
　　# 并内联创建所有绘图（也就是在页面上）
　　
　　%matplotlib inline
　　
　　import matplotlib.pyplot as pyplot
　　
　　pyplot.plot([1.6, 2.7])
　　
　　# [<matplotlib.lines.Line2D at 0x10c4e7978>]
　　
　　MatPlotLib 中的饼图
　　
　　%matplotlib inline
　　
　　import pandas as pd
　　
　　import matplotlib.pyplot as plt
　　
　　raw_data = {'officer_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
　　
　　'jan_arrests': [4, 24, 31, 2, 3],
　　
　　'feb_arrests': [25, 94, 57, 62, 70],
　　
　　'march_arrests': [5, 43, 23, 23, 51]}
　　
　　df = pd.DataFrame(raw_data, columns = ['officer_name', 'jan_arrests', 'feb_arrests', 'march_arrests'])
　　
　　officer_name jan_arrests feb_arrests march_arrests
　　
　　0 Jason 4 25 5
　　
　　1 Molly 24 94 43
　　
　　2 Tina 31 57 23
　　
　　3 Jake 2 62 23
　　
　　4 Amy 3 70 51
　　
　　# 创建一列，其中包含每个官员的总逮捕数
　　
　　df['total_arrests'] = df['jan_arrests'] + df['feb_arrests'] + df['march_arrests']
　　
　　officer_name jan_arrests feb_arrests march_arrests total_arrests
　　
　　0 Jason 4 25 5 34
　　
　　1 Molly 24 94 43 161
　　
　　2 Tina 31 57 23 111
　　
　　3 Jake 2 62 23 87
　　
　　4 Amy 3 70 51 124
　　
　　# （从 iWantHue）创建一列颜色
　　
　　colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E"]
　　
　　# 创建饼图
　　
　　plt.pie(
　　
　　# 使用数据 total_arrests
　　
　　df['total_arrests'],
　　
　　# 标签为官员名称
　　
　　labels=df['officer_name'],
　　
　　# 没有阴影
　　
　　shadow=False,
　　
　　# 颜色
　　
　　colors=colors,
　　
　　# 将一块扇形移出去
　　
　　explode=(0, 0, 0, 0, 0.15),
　　
　　# 起始角度为 90 度
　　
　　startangle=90,
　　
　　# 将百分比列为分数
　　
　　autopct='%1.1f%%',
　　
　　)
　　
　　# 使饼状图为正圆
　　
　　plt.axis('equal')
　　
　　# 查看绘图
　　
　　plt.show()
　　
　　MatPlotLib 中的散点图
　　
　　%matplotlib inline
　　
　　import pandas as pd
　　
　　import matplotlib.pyplot as plt
　　
　　import numpy as np
　　
　　# 将 ipython 的最大显示行数设为 1000
　　
　　pd.set_option('display.max_row', 1000)
　　
　　# 将 ipython 的最大显示列数设为 50
　　
　　pd.set_option('display.max_columns', 50)
　　
　　df = pd.read_csv('https://raw.githubusercontent.com/chrisalbon/war_of_the_five_kings_dataset/master/5kings_battles_v1.csv')
　　
　　df.head()
　　
　　name year battle_number attacker_king defender_king attacker_1 attacker_2 attacker_3 attacker_4 defender_1 defender_2 defender_3 defender_4 attacker_outcome battle_type major_death major_capture attacker_size defender_size attacker_commander defender_commander summer location region note
　　
　　0 Battle of the Golden Tooth 298 1 Joffrey/Tommen Baratheon Robb Stark Lannister NaN NaN NaN Tully NaN NaN NaN win pitched battle 1.0 0.0 15000.0 4000.0 Jaime Lannister Clement Piper, Vance 1.0 Golden Tooth The Westerlands NaN
　　
　　1 Battle at the Mummer’s Ford 298 2 Joffrey/Tommen Baratheon Robb Stark Lannister NaN NaN NaN Baratheon NaN NaN NaN win ambush 1.0 0.0 NaN 120.0 Gregor Clegane Beric Dondarrion 1.0 Mummer’s Ford The Riverlands NaN
　　
　　2 Battle of Riverrun 298 3 Joffrey/Tommen Baratheon Robb Stark Lannister NaN NaN NaN Tully NaN NaN NaN win pitched battle 0.0 1.0 15000.0 10000.0 Jaime Lannister, Andros Brax Edmure Tully, Tytos Blackwood 1.0 Riverrun The Riverlands NaN
　　
　　3 Battle of the Green Fork 298 4 Robb Stark Joffrey/Tommen Baratheon Stark NaN NaN NaN Lannister NaN NaN NaN loss pitched battle 1.0 1.0 18000.0 20000.0 Roose Bolton, Wylis Manderly, Medger Cerwyn, H… Tywin Lannister, Gregor Clegane, Kevan Lannist… 1.0 Green Fork The Riverlands NaN
　　
　　4 Battle of the Whispering Wood 298 5 Robb Stark Joffrey/Tommen Baratheon Stark Tully NaN NaN Lannister NaN NaN NaN win ambush 1.0 1.0 1875.0 6000.0 Robb Stark, Brynden Tully Jaime Lannister 1.0 Whispering Wood The Riverlands NaN
　　
　　# 创建图形
　　
　　plt.figure(figsize=(10,8))
　　
　　# 创建散点图
　　
　　# 298 年的攻击方大小为 x 轴
　　
　　plt.scatter(df['attacker_size'][df['year'] == 298],
　　
　　# 298 年的防守方大小为 y 轴
　　
　　df['defender_size'][df['year'] == 298],
　　
　　# 标记
　　
　　marker='x',
　　
　　# 颜色
　　
　　color='b',
　　
　　# 透明度
　　
　　alpha=0.7,
　　
　　# 大小
　　
　　s = 124,
　　
　　# 标签
　　
　　label='Year 298')
　　
　　# 299 年的攻击方大小为 x 轴
　　
　　plt.scatter(df['attacker_size'][df['year'] == 299],
　　
　　# 299 年的防守方大小为 y 轴
　　
　　df['defender_size'][df['year'] == 299],
　　
　　# 标记
　　
　　marker='o',
　　
　　# 颜色
　　
　　color='r',
　　
　　# 透明度
　　
　　alpha=0.7,
　　
　　# 大小
　　
　　s = 124,
　　
　　# 标签
　　
　　label='Year 299')
　　
　　# 300 年的攻击方大小为 x 轴
　　
　　plt.scatter(df['attacker_size'][df['year'] == 300],
　　
　　# 300 年的防守方大小为 y 轴
　　
　　df['defender_size'][df['year'] == 300],
　　
　　# 标记
　　
　　marker='^',
　　
　　# 颜色
　　
　　color='g',
　　
　　# 透明度
　　
　　alpha=0.7,
　　
　　# 大小
　　
　　s = 124,
　　
　　# 标签
　　
　　label='Year 300')
　　
　　# 标题
　　
　　plt.title('Battles Of The War Of The Five Kings')
　　
　　# y 标签
　　
　　plt.ylabel('Defender Size')
　　
　　# x 标签
　　
　　plt.xlabel('Attacker Size')
　　
　　# 图例
　　
　　plt.legend(loc='upper right')
　　
　　# 设置图形边界
　　
　　plt.xlim([min(df['attacker_size'])-1000, max(df['attacker_size'])+1000])
　　
　　plt.ylim([min(df['defender_size'])-1000, max(df['defender_size'])+1000])
　　
　　MatPlotLib 中的栈式百分比条形图
　　
　　%matplotlib inline
　　
　　import pandas as pd
　　
　　import matplotlib.pyplot as plt
　　
　　raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
　　
　　'pre_score': [4, 24, 31, 2, 3],
　　
　　'mid_score': [25, 94, 57, 62, 70],
　　
　　'post_score': [5, 43, 23, 23, 51]}
　　
　　df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
　　
　　first_name pre_score mid_score post_score
　　
　　0 Jason 4 25 5
　　
　　1 Molly 24 94 43
　　
　　2 Tina 31 57 23
　　
　　3 Jake 2 62 23
　　
　　4 Amy 3 70 51
　　
　　# 创建带有一个子图的图形
　　
　　f, ax = plt.subplots(1, figsize=(10,5))
　　
　　# 将条宽设为 1
　　
　　bar_width = 1
　　
　　# 条形左边界的位置
　　
　　bar_l = [i for i in range(len(df['pre_score']))]
　　
　　# x 轴刻度的位置（条形的中心是条形标签）
　　
　　tick_pos = [i+(bar_width/2) for i in bar_l]
　　
　　# 创建每个参与者的总得分
　　
　　totals = [i+j+k for i,j,k in zip(df['pre_score'], df['mid_score'], df['post_score'])]
　　
　　# 创建每个参与者的 pre_score 和总得分的百分比
　　
　　pre_rel = [i / j * 100 for i,j in zip(df['pre_score'], totals)]
　　
　　# 创建每个参与者的 mid_score 和总得分的百分比
　　
　　mid_rel = [i / j * 100 for i,j in zip(df['mid_score'], totals)]
　　
　　# 创建每个参与者的 post_score 和总得分的百分比
　　
　　post_rel = [i / j * 100 for i,j in zip(df['post_score'], totals)]
　　
　　# 在位置 bar_l 创建条形图
　　
　　ax.bar(bar_l,
　　
　　# 使用数据 pre_rel
　　
　　pre_rel,
　　
　　# 标签
　　
　　label='Pre Score',
　　
　　# 透明度
　　
　　alpha=0.9,
　　
　　# 颜色
　　
　　color='#019600',
　　
　　# 条形宽度
　　
　　width=bar_width,
　　
　　# 边框颜色
　　
　　edgecolor='white'
　　
　　)
　　
　　# 在位置 bar_l 创建条形图
　　
　　ax.bar(bar_l,
　　
　　# 使用数据 mid_rel
　　
　　mid_rel,
　　
　　# 底部为 pre_rel
　　
　　bottom=pre_rel,
　　
　　# 标签
　　
　　label='Mid Score',
　　
　　# 透明度
　　
　　alpha=0.9,
　　
　　# 颜色
　　
　　color='#3C5F5A',
　　
　　# 条形宽度
　　
　　width=bar_width,
　　
　　# 边框颜色
　　
　　edgecolor='white'
　　
　　)
　　
　　# 在位置 bar_l 创建条形图
　　
　　ax.bar(bar_l,
　　
　　# 使用数据 post_rel
　　
　　post_rel,
　　
　　# 底部为 pre_rel 和 mid_rel
　　
　　bottom=[i+j for i,j in zip(pre_rel, mid_rel)],
　　
　　# 标签
　　
　　label='Post Score',
　　
　　# 透明度
　　
　　alpha=0.9,
　　
　　# 颜色
　　
　　color='#219AD8',
　　
　　# 条形宽度
　　
　　width=bar_width,
　　
　　# 边框颜色
　　
　　edgecolor='white'
　　
　　)
　　
　　# 将刻度设为 first_name
　　
　　plt.xticks(tick_pos, df['first_name'])
　　
　　ax.set_ylabel("Percentage")
　　
　　ax.set_xlabel("")
　　
　　# 设置图形边界
　　
　　plt.xlim([min(tick_pos)-bar_width, max(tick_pos)+bar_width])
　　
　　plt.ylim(-10, 110)
　　
　　# 旋转轴标签
　　
　　plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
　　
　　# 展示绘图
　　
　　plt.show()