文章目录
Pandas的基础
pandas概念
Pandas 是 Python 中用于分析结构化数据的工具集
基础是 NumPy:提供高性能矩阵运算
图形库 Matplotlib:提供数据可视化
>>> import pandas as pd
>>> import numpy as np
>>> s = pd.Series([1,3,5,np.NaN])
>>> s = pd.Series([1,3,5,np.NaN,8,4])
>>> s
0 1.0
1 3.0
2 5.0
3 NaN
4 8.0
5 4.0
dtype: float64
>>>
>>> dates = pd.date_range('20160301',periods=6)
>>> dates
DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
'2016-03-05', '2016-03-06'],
dtype='datetime64[ns]', freq='D')
>>> data = pd.DataFrame(np.random.randn(6,4),index = dates,columns = list('ABCD'))
>>> data
A B C D
2016-03-01 -0.427646 0.130792 -0.839618 -0.615116
2016-03-02 0.034087 1.309524 -0.082069 0.745936
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
2016-03-06 0.440786 -1.884992 0.325900 1.545900
>>> data.shape
(6, 4)
>>> data.values
array([[-0.42764635, 0.13079174, -0.83961782, -0.61511628],
[ 0.03408716, 1.3095237 , -0.08206941, 0.74593586],
[-0.63469473, 1.16709615, -0.39000379, 0.9571454 ],
[-0.10302765, 0.91414639, 1.39577326, -1.25683557],
[-0.64320542, -0.13358293, -0.53499056, -0.34168362],
[ 0.44078573, -1.88499219, 0.32589968, 1.54590003]])
运用字典
>>> d = {
'A':1,'B':pd.Timestamp('20130301'),'C':range(4),'D':np.arange(4)}
>>> d
{
'A': 1, 'B': Timestamp('2013-03-01 00:00:00'), 'C': range(0, 4), 'D': array([0, 1, 2, 3])}
用字典创建 DataFrame(df = pd.DataFrame(d)),然后查看数据类型
>>> df.dtypes
A int64
B datetime64[ns]
C int64
D int32
dtype: object
查看列
>>> df.A
0 1
1 1
2 1
3 1
Name: A, dtype: int64
>>> df.B
0 2013-03-01
1 2013-03-01
2 2013-03-01
3 2013-03-01
Name: B, dtype: datetime64[ns]
>>> type(df.B)
<class 'pandas.core.series.Series'>
查看数据
>>> data
A B C D
2016-03-01 -0.427646 0.130792 -0.839618 -0.615116
2016-03-02 0.034087 1.309524 -0.082069 0.745936
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
2016-03-06 0.440786 -1.884992 0.325900 1.545900
>>> data.head()#查看前五行
A B C D
2016-03-01 -0.427646 0.130792 -0.839618 -0.615116
2016-03-02 0.034087 1.309524 -0.082069 0.745936
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
>>> data.head(2)
A B C D
2016-03-01 -0.427646 0.130792 -0.839618 -0.615116
2016-03-02 0.034087 1.309524 -0.082069 0.745936
>>> data.tail() #查看后五行
A B C D
2016-03-02 0.034087 1.309524 -0.082069 0.745936
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
2016-03-06 0.440786 -1.884992 0.325900 1.545900
>>> data.tail(3)
A B C D
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
2016-03-06 0.440786 -1.884992 0.325900 1.545900
行标签、列标签、值
>>> data.index
DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
'2016-03-05', '2016-03-06'],
dtype='datetime64[ns]', freq='D')
>>> data.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
>>> data.values
array([[-0.42764635, 0.13079174, -0.83961782, -0.61511628],
[ 0.03408716, 1.3095237 , -0.08206941, 0.74593586],
[-0.63469473, 1.16709615, -0.39000379, 0.9571454 ],
[-0.10302765, 0.91414639, 1.39577326, -1.25683557],
[-0.64320542, -0.13358293, -0.53499056, -0.34168362],
[ 0.44078573, -1.88499219, 0.32589968, 1.54590003]])
基本统计数据,数据转置
>>> data.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.222284 0.250497 -0.020835 0.172558
std 0.426390 1.193674 0.800225 1.073168
min -0.643205 -1.884992 -0.839618 -1.256836
25% -0.582933 -0.067489 -0.498744 -0.546758
50% -0.265337 0.522469 -0.236037 0.202126
75% -0.000192 1.103859 0.223907 0.904343
max 0.440786 1.309524 1.395773 1.545900
>>> data.T
2016-03-01 2016-03-02 2016-03-03 2016-03-04 2016-03-05 2016-03-06
A -0.427646 0.034087 -0.634695 -0.103028 -0.643205 0.440786
B 0.130792 1.309524 1.167096 0.914146 -0.133583 -1.884992
C -0.839618 -0.082069 -0.390004 1.395773 -0.534991 0.325900
D -0.615116 0.745936 0.957145 -1.256836 -0.341684 1.545900
#转置前后对比
>>> data.T.shape
(4, 6)
>>> data.shape
(6, 4)
排序
>>> data.sort_index(axis=1) #按列标签排序
A B C D
2016-03-01 -0.427646 0.130792 -0.839618 -0.615116
2016-03-02 0.034087 1.309524 -0.082069 0.745936
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
2016-03-06 0.440786 -1.884992 0.325900 1.545900
>>> data.sort_index(axis = 1,ascending=False)
D C B A
2016-03-01 -0.615116 -0.839618 0.130792 -0.427646
2016-03-02 0.745936 -0.082069 1.309524 0.034087
2016-03-03 0.957145 -0.390004 1.167096 -0.634695
2016-03-04 -1.256836 1.395773 0.914146 -0.103028
2016-03-05 -0.341684 -0.534991 -0.133583 -0.643205
2016-03-06 1.545900 0.325900 -1.884992 0.440786
>>> data.sort_index(axis=0) #按行标签(索引)排序
A B C D
2016-03-01 -0.427646 0.130792 -0.839618 -0.615116
2016-03-02 0.034087 1.309524 -0.082069 0.745936
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
2016-03-06 0.440786 -1.884992 0.325900 1.545900
>>> data.sort_index(axis = 0,ascending=False)
A B C D
2016-03-06 0.440786 -1.884992 0.325900 1.545900
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-02 0.034087 1.309524 -0.082069 0.745936
2016-03-01 -0.427646 0.130792 -0.839618 -0.615116
>>> data.sort_values(by='A') #按 A 列的值排序
A B C D
2016-03-05 -0.643205 -0.133583 -0.534991 -0.341684
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-01 -0.427646 0.130792 -0.839618 -0.615116
2016-03-04 -0.103028 0.914146 1.395773 -1.256836
2016-03-02 0.034087 1.309524 -0.082069 0.745936
2016-03-06 0.440786 -1.884992 0.325900 1.545900
>>> data['A'] #查看 A 列的值
2016-03-01 -0.427646
2016-03-02 0.034087
2016-03-03 -0.634695
2016-03-04 -0.103028
2016-03-05 -0.643205
2016-03-06 0.440786
Freq: D, Name: A, dtype: float64
>>> data.A
2016-03-01 -0.427646
2016-03-02 0.034087
2016-03-03 -0.634695
2016-03-04 -0.103028
2016-03-05 -0.643205
2016-03-06 0.440786
Freq: D, Name: A, dtype: float64
数据索引(切片与 loc)
>>> data[2:4]
A B C D
2016-03-03 -0.634695 1.167096 -0.390004 0.957145
2016-03-