声明:版权所有,转载请联系作者并注明出处 http://blog.csdn.net/u013719780?viewmode=contents
博主简介:风雪夜归子(Allen),机器学习算法攻城狮,喜爱钻研Meachine Learning的黑科技,对Deep Learning和Artificial Intelligence充满兴趣,经常关注Kaggle数据挖掘竞赛平台,对数据、Machine Learning和Artificial Intelligence有兴趣的童鞋可以一起探讨哦,个人CSDN博客:http://blog.csdn.net/u013719780?viewmode=contents
本文使用pandas对数据进行可视化。
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
sns.set_context('notebook', font_scale=1.5)
cp = sns.color_palette()
In [2]:
ts = pd.read_csv('data/ts.csv')
ts = ts.assign(dt = pd.to_datetime(ts.dt))
ts.head()
Out[2]:
dt | kind | value | |
---|---|---|---|
0 | 2000-01-01 | A | 1.442521 |
1 | 2000-01-02 | A | 1.981290 |
2 | 2000-01-03 | A | 1.586494 |
3 | 2000-01-04 | A | 1.378969 |
4 | 2000-01-05 | A | -0.277937 |
In [3]:
dfp = ts.pivot(index='dt', columns='kind', values='value')
dfp.head()
Out[3]:
kind | A | B | C | D |
---|---|---|---|---|
dt | ||||
2000-01-01 | 1.442521 | 1.808741 | 0.437415 | 0.096980 |
2000-01-02 | 1.981290 | 2.277020 | 0.706127 | -1.523108 |
2000-01-03 | 1.586494 | 3.474392 | 1.358063 | -3.100735 |
2000-01-04 | 1.378969 | 2.906132 | 0.262223 | -2.660599 |
2000-01-05 | -0.277937 | 3.489553 | 0.796743 | -3.417402 |
In [4]:
fig, ax = plt.subplots(1, 1, figsize=(7.5, 5))
dfp.plot(ax=ax)
ax.set(xlabel='Date',
ylabel='Value',
title='Random Timeseries')
ax.legend(loc=2)
fig.autofmt_xdate()
In [5]:
df = pd.read_csv('data/iris.csv')
df.head()
Out[5]:
petalLength | petalWidth | sepalLength | sepalWidth | species | |
---|---|---|---|---|---|
0 | 1.4 | 0.2 | 5.1 | 3.5 | setosa |
1 | 1.4 | 0.2 | 4.9 | 3.0 | setosa |
2 | 1.3 | 0.2 | 4.7 | 3.2 | setosa |
3 | 1.5 | 0.2 | 4.6 | 3.1 | setosa |
4 | 1.4 | 0.2 | 5.0 | 3.6 | setosa |
In [6]:
fig, ax = plt.subplots(1, 1, figsize=(7.5, 7.5))
for i, s in enumerate(df.species.unique()):
df[df.species == s].plot.scatter(
'petalLength', 'petalWidth',
c=cp[i], label=s, ax=ax
)
ax.set(xlabel='Petal Length',
ylabel='Petal Width',
title='Petal Width v. Length -- by Species')
ax.legend(loc=2)
Out[6]:
<matplotlib.legend.Legend at 0x114bad610>
In [7]:
dfp.plot(subplots=True, layout=(2, 2), figsize=(10, 10),
title='Random Timeseries -- Value over Time')
Out[7]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x117863bd0>, <matplotlib.axes._subplots.AxesSubplot object at 0x117ed3550>], [<matplotlib.axes._subplots.AxesSubplot object at 0x117f57390>, <matplotlib.axes._subplots.AxesSubplot object at 0x117fb8a10>]], dtype=object)
In [8]:
tmp_n = df.shape[0] - df.shape[0]/2
df['random_factor'] = np.random.permutation(['A'] * tmp_n + ['B'] * (df.shape[0] - tmp_n))
df.head()
Out[8]:
petalLength | petalWidth | sepalLength | sepalWidth | species | random_factor | |
---|---|---|---|---|---|---|
0 | 1.4 | 0.2 | 5.1 | 3.5 | setosa | A |
1 | 1.4 | 0.2 | 4.9 | 3.0 | setosa | A |
2 | 1.3 | 0.2 | 4.7 | 3.2 | setosa | B |
3 | 1.5 | 0.2 | 4.6 | 3.1 | setosa | B |
4 | 1.4 | 0.2 | 5.0 | 3.6 | setosa | B |
In [9]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
df.boxplot(column='petalWidth', by='species', ax=ax)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x11713cf50>
In [10]:
sns.set_context('notebook', font_scale=1.25)
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
df.boxplot(column='petalWidth', by=['species', 'random_factor'], ax=ax)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x114be5650>
In [11]:
sns.set_context('notebook', font_scale=1.5)
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
df.hist(column='petalWidth', by='species', grid=None, ax=ax)
/Applications/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.py:3369: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared "the passed axes is being cleared", UserWarning)
Out[11]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1185dc7d0>, <matplotlib.axes._subplots.AxesSubplot object at 0x119141c50>], [<matplotlib.axes._subplots.AxesSubplot object at 0x11935be50>, <matplotlib.axes._subplots.AxesSubplot object at 0x1193e9310>]], dtype=object)
In [12]:
df = pd.read_csv('data/titanic.csv')
df.head()
Out[12]:
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
In [13]:
dfg = df.groupby(['survived', 'pclass']).agg({'fare': 'mean'})
dfg
Out[13]:
fare | ||
---|---|---|
survived | pclass | |
0 | 1 | 64.684008 |
2 | 19.412328 | |
3 | 13.669364 | |
1 | 1 | 95.608029 |
2 | 22.055700 | |
3 | 13.694887 |
In [14]:
fig, ax = plt.subplots(1, 1, figsize=(12.5, 7))
dfg.reset_index().\
pivot(index='pclass',
columns='survived',
values='fare').plot.bar(ax=ax)
ax.set(xlabel='Class',
ylabel='Fare',
title='Fare by survival and class')
Out[14]:
[<matplotlib.text.Text at 0x1190fe310>, <matplotlib.text.Text at 0x11715b950>, <matplotlib.text.Text at 0x1194ee110>]