数据可视化是指通过可视化的方式来表示和探索数据,它与数据挖掘紧密相关;数据挖掘是指使用代码来探索数据集的规律和关联。可视化的目的是简洁地向观看者展示数据的规律和意义。
本文中使用的可视化模块为matplotlib和pygal
一、准备工作
1.1安装matplotlib
linux:sudo apt-get install python3-matplotlib
windows:使用anaconda安装或者安装pip之后用pip install matplotlib
mac OS:pip install --user matplotlib
1.2测试并进行简单绘画
# Plot the first five perfect squares as a simple line chart.
import matplotlib.pyplot as plt

squares = [1, 4, 9, 16, 25]
plt.plot(squares)  # only the y-values are passed here
plt.show()
1.3修改标签文字和线条粗细
# API cheat sheet for styling a chart.
# NOTE(review): `size`, `n`, `x` and `y` are not defined in this snippet; they
# look like placeholders for the reader's own values -- confirm before running.
plt.plot(squares, linewidth=size)  # change the line thickness
plt.title('name', fontsize=n)  # change the chart title
# Label both axes.
plt.xlabel('name', fontsize=n)
plt.ylabel('name', fontsize=n)
# Set the size of the tick labels.
plt.tick_params(axis='both', labelsize=14)
# plot() can take the independent and the dependent variable together.
plt.plot(x, y, linewidth=14)
# Shade a region of the chart with
#     plt.fill_between(x, y1, y2, facecolor='red', alpha=0.1)
# where alpha sets the opacity: 0 is transparent, 1 is opaque.
1.4散点图
# scatter_squares.py
# Draw a single point at (2, 4).
import matplotlib.pyplot as plt

plt.scatter(2, 4, s=200)  # s sets the size of the marker
plt.show()
# Plot a series of points.
import matplotlib.pyplot as plt

x = [1, 2, 3, 4, 5, 6, 7]
y = [3, 4, 6, 8, 3, 4, 5]
# edgecolor='none' removes the outline around each data point.
plt.scatter(x, y, c='yellow', edgecolor='none', s=100)
plt.axis([0, 8, 0, 11])  # set the range of both axes
plt.show()
# Plot a series of points and colour them with a colormap.
import matplotlib.pyplot as plt

x = [1, 2, 3, 4, 5, 6, 7]
y = [3, 4, 6, 8, 3, 4, 5]
# c=y together with cmap maps each point's colour to its y-value, which makes
# the trend in the data visible.
plt.scatter(x, y, c=y, cmap=plt.cm.Blues, edgecolor='none', s=100)
plt.axis([0, 8, 0, 11])  # set the range of both axes
# Save the chart automatically; bbox_inches='tight' trims surplus whitespace.
# This must happen before show(): once the window opened by show() has been
# closed the figure is discarded, and a later savefig() writes an empty image.
plt.savefig('filename', bbox_inches='tight')
plt.show()
1.5随机漫步
每次行走都是随机的,意味着方向不一定,行走步数不一定。
1.5.1创建RandWalk类
from random import choice
import matplotlib.pyplot as plt
class RandWalk():
    """Generate the points of a two-dimensional random walk."""

    def __init__(self, num_points=5000):
        """Set up a walk that starts at the origin.

        Args:
            num_points: number of points the finished walk contains,
                counting the starting point.
        """
        self.num_points = num_points
        # Every walk starts at (0, 0).
        self.x_values = [0]
        self.y_values = [0]

    def fill_walk(self):
        """Append random steps until the walk holds ``num_points`` points."""
        while len(self.x_values) < self.num_points:
            # Pick a direction and a distance for each axis.
            x_direction = choice([1, -1])
            x_distance = choice([0, 1, 2, 3, 4])
            x_step = x_direction * x_distance

            y_direction = choice([1, -1])
            y_distance = choice([0, 1, 2, 3, 4])
            y_step = y_direction * y_distance

            # Reject a step that would not move at all.
            if x_step == 0 and y_step == 0:
                continue

            # The next point is the last point shifted by the step.
            self.x_values.append(self.x_values[-1] + x_step)
            self.y_values.append(self.y_values[-1] + y_step)
# Draw the walk as a scatter plot.
plt.figure(figsize=(10, 6))  # figure() also takes e.g. dpi, size, background colour
rw = RandWalk()
rw.fill_walk()
# Colour the points by their position in the walk so its progress is visible.
point_numbers = list(range(rw.num_points))
plt.scatter(rw.x_values, rw.y_values, c=point_numbers, cmap=plt.cm.Blues, edgecolor='none', s=1)
# Emphasise the start point and the end point.
plt.scatter(0, 0, edgecolors='none', c='red', s=40)
plt.scatter(rw.x_values[-1], rw.y_values[-1], edgecolors='none', c='yellow', s=40)
# Hide both axes. gca() returns the Axes the points were drawn on, whereas a
# bare axes() call adds a new, empty Axes in current Matplotlib releases.
current_axes = plt.gca()
current_axes.get_xaxis().set_visible(False)
current_axes.get_yaxis().set_visible(False)
plt.show()
from random import choice
class RandWalk():
    """Produce the coordinates of a random walk in the plane."""

    def __init__(self, num_points=5000):
        """Record the requested length and place the first point at (0, 0).

        Args:
            num_points: how many points the walk holds once it is filled,
                the starting point included.
        """
        self.num_points = num_points
        # The walk sets out from (0, 0).
        self.x_values = [0]
        self.y_values = [0]

    def fill_walk(self):
        """Keep stepping until ``num_points`` points have been collected."""
        while len(self.x_values) < self.num_points:
            # Choose direction and distance along x, then along y.
            x_direction = choice([1, -1])
            x_distance = choice([0, 1, 2, 3, 4])
            x_step = x_direction * x_distance

            y_direction = choice([1, -1])
            y_distance = choice([0, 1, 2, 3, 4])
            y_step = y_direction * y_distance

            # Standing still does not count as a step.
            if x_step == 0 and y_step == 0:
                continue

            # Work out the next point and record it.
            next_x = self.x_values[-1] + x_step
            next_y = self.y_values[-1] + y_step
            self.x_values.append(next_x)
            self.y_values.append(next_y)
# Draw the walk as one connected line.
# This snippet uses plt but did not import it itself.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))  # figure() also takes e.g. dpi, size, background colour
rw = RandWalk()
rw.fill_walk()
plt.plot(rw.x_values, rw.y_values, linewidth=1)
# Emphasising the start and end points is left disabled: the two calls below
# pass scatter()-style keywords (edgecolors, s) to plot().
# NOTE(review): presumably plot() rejects those keywords -- verify, or use
# plt.scatter() for the two markers instead.
# plt.plot(0, 0, edgecolors='none', c='red', s=40)
# plt.plot(rw.x_values[-1], rw.y_values[-1], edgecolors='none', c='yellow', linewidth=4)
plt.show()
1.6Pygal
#掷骰子的类
from random import randint
import pygal
class Die():
    """A single die with a configurable number of sides."""

    def __init__(self, num_sides):
        """Remember how many sides the die has.

        Args:
            num_sides: number of faces; rolls range from 1 to this value.
        """
        self.num_sides = num_sides

    def roll(self):
        """Return a random integer from 1 to ``num_sides`` inclusive."""
        return randint(1, self.num_sides)
# Try out the Die class: roll one die 1000 times and chart the outcome.
sides = 6
die = Die(sides)
results = [die.roll() for _ in range(1000)]

# Analyse the results: count how often each face value came up.
face_values = range(1, sides + 1)
num_values = [results.count(value) for value in face_values]

# Visualise the counts as a bar chart.
hist = pygal.Bar()
hist.title = 'Results of rolling D6 1000 times'
# Derive the labels from `sides` so they always match the counted values.
hist.x_labels = [str(value) for value in face_values]
hist.x_title = 'num_values'
hist.y_title = 'Frequency of Result'
# add() attaches the data series to the chart under the given label.
hist.add('D6', num_values)
hist.render_to_file('die_visual.svg')
二、下载数据后的数据的处理
下载后的数据文件一般分为两种格式:
csv格式 用逗号隔开
json格式 列表中的元素是字典类型[{}, {}, {}],可用json.load()读取、json.dump()写入
# Print every column name of a CSV file together with its index.
import csv

filename = ''  # placeholder: set this to the path of the CSV file to inspect
# newline='' is what the csv module documentation asks for when opening files.
with open(filename, newline='') as f:
    reader = csv.reader(f)
    header_csv = next(reader)  # the first row holds the column names
    # Walk over the header values and their indexes.
    for index, value in enumerate(header_csv):
        # index is an int, so convert it before joining it to the strings.
        print(str(index) + ':' + value)
# NOTE: the loop was first written as
#     for index,value = enumerate(header_csv):
# which Python rejects with "SyntaxError: invalid syntax" -- a for loop
# needs `in`, not `=`.
三、使用API
#导入requests模块,使用pygal来可视化数据
import requests
import pygal
from pygal.style import LightColorizedStyle as LCS, LightenStyle as LS
# Ask the GitHub search API for the most-starred Python repositories and
# chart their star counts.
url = 'https://api.github.com/search/repositories?q=language:python&sort=stars'
r = requests.get(url)
print('status code:', r.status_code)
# Stop on an HTTP error instead of failing later on a missing 'items' key.
r.raise_for_status()
response_dict = r.json()
print(response_dict.keys())  # keys of the response dictionary

# Take a closer look at the first repository.
repo_dicts = response_dict['items']
first_repo = repo_dicts[0]
# List the fields one repository entry contains.
print(len(first_repo.keys()))
for key in sorted(first_repo.keys()):
    print(key)

# Collect the name and the star count of every returned repository.
names = [repo['name'] for repo in repo_dicts]
stars = [repo['stargazers_count'] for repo in repo_dicts]

# Build the bar chart.
my_style = LS('#333366', base_style=LCS)
chart = pygal.Bar(style=my_style, x_label_rotation=45, show_legend=False)
chart.title = 'Most-Starred Python Projects on Github'
chart.x_labels = names
chart.add('', stars)
chart.render_to_file('python_repos.svg')
status code: 200
dict_keys(['total_count', 'incomplete_results', 'items'])
73
archive_url
archived
assignees_url
blobs_url
branches_url
clone_url
collaborators_url
comments_url
commits_url
compare_url
contents_url
contributors_url
created_at
default_branch
deployments_url
description
downloads_url
events_url
fork
forks
forks_count
forks_url
full_name
git_commits_url
git_refs_url
git_tags_url
git_url
has_downloads
has_issues
has_pages
has_projects
has_wiki
homepage
hooks_url
html_url
id
issue_comment_url
issue_events_url
issues_url
keys_url
labels_url
language
languages_url
license
merges_url
milestones_url
mirror_url
name
node_id
notifications_url
open_issues
open_issues_count
owner
private
pulls_url
pushed_at
releases_url
score
size
ssh_url
stargazers_count
stargazers_url
statuses_url
subscribers_url
subscription_url
svn_url
tags_url
teams_url
trees_url
updated_at
url
watchers
watchers_count
下面还有Hacker News示例,这里不再总结,其流程主要包括:通过API请求资源,将响应按照JSON格式进行处理,最后提取信息并展示。总之,还是要熟练掌握JSON的格式。