Pandas —— 数据分析处理库
安装Pandas: pip install pandas
import pandas as pd
pd.show_versions() # 显示当前版本信息
读取数据
读数据
pd.read_csv()
: 读取csv类型数据
df = pd.read_csv('./data/titanic.csv')
df.head(n)
: 显示前n条数据
df.head(6) # head()可以读取前几条数据,指定前几条都可以
df.info()
: 返回数据的总体信息
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
'''
数据项的索引
df.index
: 数据项
df.columns
: 指标
df.index # RangeIndex(start=0, stop=891, step=1)
df.columns
'''
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
'''
df.dtypes
: 数据类型
df.dtypes
'''
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
'''
索引
索引单个column
age = df['Age']
age[:5]
'''
age = df['Age']
age[:5]
age = df['Age']
age[:5]
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
Name: Age, dtype: float64
'''
索引多个column
df[['Age','Fare']][:5]
set_index
: 默认index是0,1,2…, 也可以用set_index来指定某一个指标为index
df = df.set_index('Name')
df['Age'][:5]
'''
Name
Braund, Mr. Owen Harris 22.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 38.0
Heikkinen, Miss. Laina 26.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0
Allen, Mr. William Henry 35.0
Name: Age, dtype: float64
'''
age = df['Age']
age['Allen, Mr. William Henry'] #35.0
age=age+10
age[:5]
'''
Name
Braund, Mr. Owen Harris 32.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 48.0
Heikkinen, Miss. Laina 36.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 45.0
Allen, Mr. William Henry 45.0
Name: Age, dtype: float64
'''
iloc
:用position 来定位
df.iloc[0] # 取出第一条数据
df.iloc[0:5] # 展示前5行的信息
df.iloc[0:5,1:3] # 展示前5行,第2,3列的信息
loc
:用label 来定位
df = df.set_index('Name')
df.loc['Heikkinen, Miss. Laina'] #展示Heikkinen, Miss. Laina的所有信息
df.loc['Heikkinen, Miss. Laina','Fare'] #展示Heikkinen, Miss. Laina的fare信息
df.loc['Heikkinen, Miss. Laina':'Allen, Mr. William Henry',:] # 展示Heikkinen, Miss. Laina到Allen, Mr. William Henry的所有信息
df.loc['Heikkinen, Miss. Laina','Fare'] = 1000 # 修改信息
bool类型的索引
df['Fare'] > 40 # 对于每一条数据返回True或者False
df[df['Fare'] > 40][:5] #返回Fare大于40的前5条数据
df[df['Sex'] == 'male'][:5] #返回男性的前5条数据
df.loc[df['Sex'] == 'male','Age'].mean() #返回男性年龄的平均值
(df['Age'] > 70).sum()#年龄大于70的人数
数据处理
数值运算
import pandas as pd
df = pd.DataFrame([[1,2,3],[4,5,6]],index = ['a','b'],columns = ['A','B','C'])
df.sum()
: 求和运算
# 默认按照列求和
df.sum()
'''
A 5
B 7
C 9
dtype: int64
'''
# axis=1 按照行求和
df.sum(axis = 1)
'''
a 6
b 15
dtype: int64
'''
# axis = 'columns' 按照行求和
df.sum(axis = 'columns')
'''
a 6
b 15
dtype: int64
'''
df.mean()
:均值
# 默认按照列求均值
df.mean()
'''
A 2.5
B 3.5
C 4.5
dtype: float64
'''
# axis=1 按照行求均值
df.mean(axis = 1)
'''
a 2.0
b 5.0
dtype: float64
'''
df.min()
,df.max()
: 求最大值,最小值
df.min()
df.max()
'''
A 4
B 5
C 6
dtype: int64
'''
统计特性
df.describe()
: 描述性统计
df.describe() # 描述性统计
median
: 求中位数
df.median()
'''
A 2.5
B 3.5
C 4.5
dtype: float64
'''
df.cov()
:返回一个协方差矩阵
df.cov()
df.corr()
: 返回一个相关系数矩阵
df.corr()
value_counts()
: 按照统计数量进行排序
#默认降序
df['Age'].value_counts()
'''
24.00 30
22.00 27
18.00 26
19.00 25
...
0.42 1
Name: Age, Length: 88, dtype: int64
'''
#升序
df['Age'].value_counts(ascending = True)
'''
0.42 1
...
18.00 26
22.00 27
24.00 30
'''
# 划分区间
df['Age'].value_counts(ascending = True,bins = 5)
'''
(64.084, 80.0] 11
(48.168, 64.084] 69
(0.339, 16.336] 100
(32.252, 48.168] 188
(16.336, 32.252] 346
Name: Age, dtype: int64
'''
dataframe结构
创
pd.DataFrame(data= ,index= ,columns = )
: 指定data, index , columns
data = [[1,2,3],[4,5,6]]
index = ['a','b']
columns = ['A','B','C']
df = pd.DataFrame(data=data,index=index,columns = columns)
df = pd.DataFrame(data,index = )
: 将columns嵌入data中
import numpy as np
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data,index = labels)
df.head()
'''
age animal priority visits
a 2.5 cat yes 1
b 3.0 cat yes 3
c 0.5 snake no 2
d NaN dog yes 3
e 5.0 dog no 2
'''
查
df[]
: 索引
df['A']
'''
a 1
b 4
Name: A, dtype: int64
'''
df.iloc[]
: 通过位置索引
df.iloc[0]
'''
A 1
B 2
C 3
Name: a, dtype: int64
'''
df.loc[]
: 通过index,columns索引
df.loc['a']
'''
A 1
B 2
C 3
Name: a, dtype: int64
'''
df.loc['a']['A'] #1
改
# 修改数据项
df.loc['a']['A'] = 150
'''
A B C
a 150 2 3
b 4 5 6
'''
# 修改index
df.index = ['f','g']
'''
A B C
f 150 2 3
g 4 5 6
'''
增
增添一行数据
df.loc['c'] = [1,2,3]
'''
A B C
f 150 2 3
g 4 5 6
c 1 2 3
'''
增添一列指标
data = [[1,2,3],[4,5,6]]
index = ['j','k']
columns = ['A','B','C']
df2 = pd.DataFrame(data=data,index=index,columns = columns)
'''
A B C
j 1 2 3
k 4 5 6
'''
df2['Q'] = [10,11]
'''
A B C Q
j 1 2 3 10
k 4 5 6 11
'''
pd.concat()
: 拼接两组数据
# axis = 0 竖着拼接
df3 = pd.concat([df,df2],axis = 0)
'''
A B C
f 150 2 3
g 4 5 6
c 1 2 3
j 1 2 3
k 4 5 6
'''
df4 = pd.DataFrame([[10,11],[12,13]],index=['j','k'],columns=['D','E'])
'''
D E
j 10 11
k 12 13
'''
# axis = 1 横着拼接
df5 = pd.concat([df2,df4],axis = 1)
'''
A B C Q D E
j 1 2 3 10 10 11
k 4 5 6 11 12 13
'''
删
df.drop()
: 删除一行或者一列的数据
df5.drop(['j'],axis=0,inplace = True)
'''
A B C Q D E
k 4 5 6 11 12 13
'''
del
:删除一列数据
del df5['Q']
'''
A B C D E
k 4 5 6 12 13
'''
df5.drop(['A','B','C'],axis = 1,inplace = True)
'''
D E
k 12 13
'''
groupby操作
合并
构建一个key值有重复的df结构
df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
'data':[0,5,10,5,10,15,10,15,20]})
比较麻烦的方法进行分组
for key in ['A','B','C']:
print (key,df[df['key'] == key].sum())
用groupby 进行分组
df.groupby('key').sum() #key值相同的数据项相加合并
用groupby 可以进行单个或者多个依据分组
import pandas as pd
import numpy as np
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
'''
A B C D
0 foo one -0.785250 1.010458
1 bar one 2.549941 1.704677
2 foo two -0.255153 -0.603249
3 bar three -0.954625 0.117662
4 foo two -0.548512 0.648127
5 bar two -0.642762 -1.111568
6 foo one 0.870697 0.556371
7 foo three 0.839937 0.798669
'''
# 单个分组依据
grouped = df.groupby('A')
grouped.count()
'''
B C D
A
bar 3 3 3
foo 5 5 5
'''
#多个分组依据
grouped = df.groupby(['A','B'])
grouped.count()
'''
C D
A B
bar one 1 1
three 1 1
two 1 1
foo one 2 2
three 1 1
two 2 2
'''
groupby 中可以传入自定义函数
def get_letter_type(letter):
    """Classify a column label: 'a' when it is a vowel, 'b' otherwise.

    Used as a grouping key with df.groupby(get_letter_type, axis=1),
    which calls it once per column label.
    """
    # Case-insensitive vowel membership test; everything else falls in group 'b'.
    return 'a' if letter.lower() in 'aeiou' else 'b'
grouped = df.groupby(get_letter_type,axis = 1)
grouped.count().iloc[0]
'''
a 1
b 3
Name: 0, dtype: int64
'''
有关groupby的其他操作
first()
,last()
: 显示第一次或者最后一次出现各个组的对应值
s = pd.Series([1,2,3,1,2,3],[8,7,5,8,7,5])
'''
8 1
7 2
5 3
8 1
7 2
5 3
dtype: int64
'''
grouped = s.groupby(level = 0)
grouped.first() #自动升序分组:5,7,8 显示第一次出现5,7,8的值
'''
5 3
7 2
8 1
dtype: int64
'''
grouped.last() #显示最后一次出现5,7,8的值
'''
5 3
7 2
8 1
dtype: int64
'''
grouped = s.groupby(level = 0,sort =False)
grouped.first()
'''
8 1
7 2
5 3
dtype: int64
'''
grouped.sum()
: 同组内的数值求和
grouped.sum()
'''
5 6
7 4
8 2
dtype: int64
'''
get_group()
: 得到该组的所有数据
df2 = pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]})
'''
X Y
0 A 1
1 B 2
2 A 3
3 B 4
'''
df2.groupby(['X']).get_group('A')
'''
X Y
0 A 1
2 A 3
'''
df2.groupby(['X']).get_group('B')
'''
X Y
1 B 2
3 B 4
'''
pd.MultiIndex.from_arrays(arrays,names = )
: 多重索引分组
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
index = pd.MultiIndex.from_arrays(arrays,names = ['first','second']) # 多重索引
s = pd.Series(np.random.randn(8),index = index)
'''
first second
bar one -0.877562
two -1.296007
baz one 1.026419
two 0.445126
foo one 0.044509
two 0.271037
qux one -1.686649
two 0.914649
dtype: float64
'''
#level =0 按照一级标签进行分组
grouped = s.groupby(level =0)
grouped.sum()
'''
first
bar -2.173569
baz 1.471545
foo 0.315545
qux -0.772001
dtype: float64
'''
#level =1 按照二级标签进行分组
grouped = s.groupby(level = 1)
grouped.sum()
'''
second
one -1.493284
two 0.334805
dtype: float64
'''
#level =first 按照一级标签进行分组
grouped = s.groupby(level = 'first')
grouped.sum()
'''
first
bar -2.173569
baz 1.471545
foo 0.315545
qux -0.772001
dtype: float64
'''
普通方法 对已经分好的组进行按组操作
df = pd.read_csv('./data/titanic.csv')
df.groupby('Sex')['Age'].mean() # 分别计算男性和女性的年龄平均值
aggregate()
: 对已经分好的组进行按组单个操作
import numpy as np
df.groupby('key').aggregate(np.mean) # key值相同的数据项以均值合并
'''
df=
A B C D
0 foo one -0.785250 1.010458
1 bar one 2.549941 1.704677
2 foo two -0.255153 -0.603249
3 bar three -0.954625 0.117662
4 foo two -0.548512 0.648127
5 bar two -0.642762 -1.111568
6 foo one 0.870697 0.556371
7 foo three 0.839937 0.798669
'''
# 多重依据分组
grouped = df.groupby(['A','B'])
grouped.aggregate(np.sum)
'''
C D
A B
bar one 2.549941 1.704677
three -0.954625 0.117662
two -0.642762 -1.111568
foo one 0.085447 1.566829
three 0.839937 0.798669
two -0.803665 0.044878
'''
# as_index = False
grouped = df.groupby(['A','B'],as_index = False)
grouped.aggregate(np.sum)
'''
A B C D
0 bar one 2.549941 1.704677
1 bar three -0.954625 0.117662
2 bar two -0.642762 -1.111568
3 foo one 0.085447 1.566829
4 foo three 0.839937 0.798669
5 foo two -0.803665 0.044878
'''
# reset_index()
df.groupby(['A','B']).sum().reset_index()
'''
A B C D
0 bar one 2.549941 1.704677
1 bar three -0.954625 0.117662
2 bar two -0.642762 -1.111568
3 foo one 0.085447 1.566829
4 foo three 0.839937 0.798669
5 foo two -0.803665 0.044878
'''
#组合个数
grouped = df.groupby(['A','B'])
grouped.size()
'''
A B
bar one 1
three 1
two 1
foo one 2
three 1
two 2
dtype: int64
'''
agg()
: 对已经分好的组进行按组多个操作
grouped = df.groupby('A')
grouped['C'].agg([np.sum,np.mean,np.std])
'''
sum mean std
A
bar 0.952553 0.317518 1.939613
foo 0.121719 0.024344 0.781542
'''
grouped['C'].agg({'res_sum':np.sum,'res_mean':np.mean,'res_std':np.std})
'''
res_std res_mean res_sum
A
bar 1.939613 0.317518 0.952553
foo 0.781542 0.024344 0.121719
'''
索引
创建数据
s = pd.Series(np.arange(5),index = np.arange(5)[::-1],dtype='int64')
'''
4 0
3 1
2 2
1 3
0 4
dtype: int64
'''
isin
: 判断数组中的值是否在列表中
# 返回值是布尔类型
s.isin([1,3,4])
'''
4 False
3 True
2 False
1 True
0 True
dtype: bool
'''
# 通过mask索引
s[s.isin([1,3,4])]
'''
3 1
1 3
0 4
dtype: int64
'''
pd.MultiIndex.from_product
: 多重依据分类
s2 = pd.Series(np.arange(6),index = pd.MultiIndex.from_product([[0,1],['a','b','c']]))
'''
0 a 0
b 1
c 2
1 a 3
b 4
c 5
dtype: int32
'''
iloc
: 通过位置索引
s2.iloc[s2.index.isin([(1,'a'),(0,'b')])]
'''
0 b 1
1 a 3
dtype: int32
'''
其他方法索引
s
'''
4 0
3 1
2 2
1 3
0 4
dtype: int64
'''
s[s>2]
'''
1 3
0 4
dtype: int64
'''
df.select
: 传入lambda 函数进行索引
dates = pd.date_range('20171124',periods=8)
df = pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D'])
'''
A B C D
2017-11-24 -1.690231 -0.338101 -1.071022 1.084637
2017-11-25 1.292291 0.449885 0.468264 0.637102
2017-11-26 0.602494 -0.591658 0.301893 -1.050524
2017-11-27 -1.132170 1.310110 0.552812 0.370947
2017-11-28 -0.113234 -0.859983 0.381977 0.371730
2017-11-29 0.616029 0.209225 1.879964 0.179152
2017-11-30 0.554969 -0.656240 2.449274 -0.302113
2017-12-01 0.700342 1.068990 -0.572698 0.577581
'''
df.select(lambda x:x=='A',axis='columns')
'''
A
2017-11-24 -1.690231
2017-11-25 1.292291
2017-11-26 0.602494
2017-11-27 -1.132170
2017-11-28 -0.113234
2017-11-29 0.616029
2017-11-30 0.554969
2017-12-01 0.700342
'''
df.where()
: 通过设置条件进行索引, 并进行相关操作
df.where(df < 0) # 小于0的位置被置为NaN
df.where(df < 0,-df) # 小于0的值被置为相反数
df.query()
: 通过设置条件进行索引
df = pd.DataFrame(np.random.rand(10,3),columns = list('abc'))
'''
a b c
0 0.760971 0.910097 0.480540
1 0.242792 0.778342 0.852441
2 0.147953 0.449719 0.539780
3 0.519164 0.936192 0.402399
4 0.365343 0.148621 0.176917
5 0.837852 0.283028 0.527734
6 0.729312 0.066871 0.747968
7 0.502851 0.462246 0.116735
8 0.472404 0.517753 0.945877
9 0.962282 0.300276 0.258252
'''
df.query('(a<b)') #列出a<b的数据项
df.query('(a<b) & (b<c)')
Series
创
pd.Series(data = ,index = )
data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data = data,index = index)
'''
a 10
b 11
c 12
dtype: int64
'''
查
# 用[]进行切片和索引
s[0] #10
s[0:2]
'''
a 10
b 11
dtype: int64
'''
# 用mask进行索引
mask = [True,False,True]
s[mask]
'''
a 10
c 12
dtype: int64
'''
# 用loc和iloc 进行索引
s.loc['b'] #11
s.iloc[1] # 11
改
# 直接通过索引[] 赋值修改
s1 = s.copy()
s1['a'] = 100
'''
a 100
b 11
c 12
dtype: int64
'''
# 通过replace 进行修改
s1.replace(to_replace = 100,value = 101,inplace = True)
'''
a 101
b 11
c 12
dtype: int64
'''
# 直接赋值对索引进行修改
s1.index # Index(['a', 'b', 'c'], dtype='object')
s1.index = ['a','b','d']
'''
a 101
b 11
d 12
dtype: int64
'''
# 通过rename进行修改
s1.rename(index = {'a':'A'},inplace = True)
'''
A 101
b 11
d 12
dtype: int64
'''
增
s.append()
: 对两个Series进行合并
data = [100,110]
index = ['h','k']
s2 = pd.Series(data = data,index = index)
'''
h 100
k 110
dtype: int64
'''
s3 = s1.append(s2)
s3['j'] = 500
'''
A 101
b 11
d 12
j 500
h 100
k 110
dtype: int64
'''
s1.append(s2,ignore_index = True)
'''
0 101
1 11
2 12
3 500
4 100
5 110
dtype: int64
'''
删
用del
或者drop
进行删除操作
s1
'''
A 101
b 11
d 12
j 500
dtype: int64
'''
del s1['A']
'''
b 11
d 12
j 500
dtype: int64
'''
s1.drop(['b','d'],inplace = True)
'''
j 500
dtype: int64
'''
合并
merge
import pandas as pd
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
'''
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
3 A3 B3 K3
'''
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
'''
C D key
0 C0 D0 K0
1 C1 D1 K1
2 C2 D2 K2
3 C3 D3 K3
'''
res = pd.merge(left, right)
'''
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
'''
res = pd.merge(left, right, on = 'key')
'''
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
'''
left = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
'key2': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
'''
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K1 K1
2 A2 B2 K2 K2
3 A3 B3 K3 K3
'''
right = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
'key2': ['K0', 'K1', 'K2', 'K4'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
'''
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K1
2 C2 D2 K2 K2
3 C3 D3 K3 K4
'''
#默认交集
res = pd.merge(left, right, on = ['key1', 'key2'])
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K1 K1 C1 D1
2 A2 B2 K2 K2 C2 D2
'''
# how = 'outer'并集
res = pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K1 K1 C1 D1
2 A2 B2 K2 K2 C2 D2
3 A3 B3 K3 K3 NaN NaN
4 NaN NaN K3 K4 C3 D3
'''
# indicator = True 添加提示信息
res = pd.merge(left, right, on = ['key1', 'key2'], how = 'outer', indicator = True)
'''
A B key1 key2 C D _merge
0 A0 B0 K0 K0 C0 D0 both
1 A1 B1 K1 K1 C1 D1 both
2 A2 B2 K2 K2 C2 D2 both
3 A3 B3 K3 K3 NaN NaN left_only
4 NaN NaN K3 K4 C3 D3 right_only
'''
# 按照'left'的进行合并
res = pd.merge(left, right, how = 'left')
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K1 K1 C1 D1
2 A2 B2 K2 K2 C2 D2
3 A3 B3 K3 K3 NaN NaN
'''
# 按照'right'的进行合并
res = pd.merge(left, right, how = 'right')
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K1 K1 C1 D1
2 A2 B2 K2 K2 C2 D2
3 NaN NaN K3 K4 C3 D3
'''
join
left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
'''
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K0
3 A3 B3 K1
'''
right = pd.DataFrame({'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
'''
C D
K0 C0 D0
K1 C1 D1
'''
result = left.join(right, on='key')
'''
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K0 C0 D0
3 A3 B3 K1 C1 D1
'''
显示格式设置
pd.set_option
: 控制显示的格式,如最大行数,列数,宽度,小数位数
# display.max_rows 显示的最大行数
pd.get_option('display.max_rows') #60
pd.set_option('display.max_rows',6)
pd.Series(index = range(0,100))
'''
0 NaN
1 NaN
2 NaN
..
97 NaN
98 NaN
99 NaN
Length: 100, dtype: float64
'''
#display.max_columns 显示的最大列数
pd.get_option('display.max_columns') #20
pd.set_option('display.max_columns',30)
pd.DataFrame(columns = range(0,30))
# display.max_colwidth 显示的最大宽度
pd.get_option('display.max_colwidth') #50
pd.set_option('display.max_colwidth',100)
pd.Series(index = ['A'],data=['t'*70])
# display.precision 设置显示的小数位数
pd.get_option('display.precision') #6
pd.Series(data = [1.23456789236546])
'''
0 1.234568
dtype: float64
'''
pd.set_option('display.precision',5)
pd.Series(data = [1.23456789236546])
'''
0 1.23457
dtype: float64
'''
特殊表格
数据透视表
数据透视表用来做数据透视,可以通过一个或多个键分组聚合DataFrame中的数据,通过aggfunc参数决定聚合类型,是groupby的高级功能。
pd.pivot_table(data, values = None,index = None,columns = None,aggfunc =‘mean’,fill_value = None,margin = False,dropna = True,margins_name =‘All’ )
data:DataFrame对象
values:要聚合的列或列的列表
index:数据透视表的index,从原数据的列中筛选
columns:数据透视表的columns,从原数据的列中筛选
aggfunc:用于聚合的函数,默认为numpy.mean,支持numpy计算方法
fill_value: 用于替换缺失值的值
margin: 添加所有行/列
dropna: 不包括条目为 NaN的列,默认为True
margin_name: 当margin为True时,将包含总计的行/列的名称
import pandas as pd
example = pd.DataFrame({'Month': ["January", "January", "January", "January",
"February", "February", "February", "February",
"March", "March", "March", "March"],
'Category': ["Transportation", "Grocery", "Household", "Entertainment",
"Transportation", "Grocery", "Household", "Entertainment",
"Transportation", "Grocery", "Household", "Entertainment"],
'Amount': [74., 235., 175., 100., 115., 240., 225., 125., 90., 260., 200., 120.]})
pivot(index =,columns=,values =)
: 返回数据透视表
example_pivot = example.pivot(index = 'Category',columns= 'Month',values = 'Amount')
'''
Month February January March
Category
Entertainment 125.0 100.0 120.0
Grocery 240.0 235.0 260.0
Household 225.0 175.0 200.0
Transportation 115.0 74.0 90.0
'''
sum
:求和
# axis = 1 按照行求和
example_pivot.sum(axis = 1)
'''
Category
Entertainment 345.0
Grocery 735.0
Household 600.0
Transportation 279.0
dtype: float64
'''
# axis = 0 按照列求和
example_pivot.sum(axis = 0)
'''
Month
February 705.0
January 584.0
March 670.0
dtype: float64
'''
pivot_table
: 对数据透视表进行相关操作
# 默认值求平均
df = pd.read_csv('./data/titanic.csv')
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare')
'''
Pclass 1 2 3
Sex
female 106.125798 21.970121 16.118810
male 67.226127 19.741782 12.661633
'''
# aggfunc='max' 求最大值
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='max')
'''
最大值
Pclass 1 2 3
Sex
female 512.3292 65.0 69.55
male 512.3292 73.5 69.55
'''
# aggfunc='count' 统计个数
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='count')
'''
统计数目
Pclass 1 2 3
Sex
female 94 76 144
male 122 108 347
'''
交叉表
默认情况下,crosstab是用来计算因子的频率表。
pd.crosstab()参数:
pd.crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, dropna=True, normalize=False)
index:要在行中分组的值
columns:要在列中分组的值
values:根据因子聚合的值数组,需指定aggfunc
aggfunc:如指定,还需指定value
normalize:将所有值除以值的总和进行归一化 ,为True时候显示百分比
pd.crosstab()
:
# 默认计算因子的频率表。
pd.crosstab(index = df['Sex'],columns = df['Pclass'])
'''
Pclass 1 2 3
Sex
female 94 76 144
male 122 108 347
'''
# aggfunc='mean' 求平均值
df.pivot_table(index = 'Pclass',columns='Sex',values='Survived',aggfunc='mean')
'''
Sex female male
Pclass
1 0.968085 0.368852
2 0.921053 0.157407
3 0.500000 0.135447
'''
# 设置条件进行筛选
df['Underaged'] = df['Age'] <= 18
df.pivot_table(index = 'Underaged',columns='Sex',values='Survived',aggfunc='mean')
'''
Sex female male
Underaged
False 0.760163 0.167984
True 0.676471 0.338028
'''
时间操作
时间转换
datetime.datetime(year=,month=,day=,hour=,minute=)
%matplotlib notebook
import datetime
dt = datetime.datetime(year=2017,month=11,day=24,hour=10,minute=30)
'''
datetime.datetime(2017, 11, 24, 10, 30)
'''
pd.Timestamp
: 设置时间
pd.Timedelta
: 时间增量
pd.to_datetime
: 时间格式转换
import pandas as pd
ts = pd.Timestamp('2017-11-24')
ts.month
ts.day
ts + pd.Timedelta('5 days')# Timestamp('2017-11-29 00:00:00')
pd.to_datetime('2017-11-24') #Timestamp('2017-11-24 00:00:00')
pd.to_datetime('24/11/2017')# Timestamp('2017-11-24 00:00:00')
时间序列
to_datetime()
: 传入的参数是时间序列
s = pd.Series(['2017-11-24 00:00:00','2017-11-25 00:00:00','2017-11-26 00:00:00'])
'''
0 2017-11-24 00:00:00
1 2017-11-25 00:00:00
2 2017-11-26 00:00:00
dtype: object
'''
ts = pd.to_datetime(s)
'''
0 2017-11-24
1 2017-11-25
2 2017-11-26
dtype: datetime64[ns]
'''
dt.hour
: 通过小时进行序列的索引
dt.weekday
: 通过星期进行序列的索引
ts.dt.hour
'''
0 0
1 0
2 0
dtype: int64
'''
ts.dt.weekday
'''
0 4
1 5
2 6
dtype: int64
'''
pd.date_range(start=,periods = ,freq = )
:生成时间序列
pd.Series(pd.date_range(start='2017-11-24',periods = 10,freq = '12H'))
'''
0 2017-11-24 00:00:00
1 2017-11-24 12:00:00
2 2017-11-25 00:00:00
3 2017-11-25 12:00:00
4 2017-11-26 00:00:00
5 2017-11-26 12:00:00
6 2017-11-27 00:00:00
7 2017-11-27 12:00:00
8 2017-11-28 00:00:00
9 2017-11-28 12:00:00
dtype: datetime64[ns]
'''
时间序列的截取的方式
data = pd.read_csv('./data/flowdata.csv')
data.head()
'''
Time L06_347 LS06_347 LS06_348
0 2009-01-01 00:00:00 0.137417 0.097500 0.016833
1 2009-01-01 03:00:00 0.131250 0.088833 0.016417
2 2009-01-01 06:00:00 0.113500 0.091250 0.016750
3 2009-01-01 09:00:00 0.135750 0.091500 0.016250
4 2009-01-01 12:00:00 0.140917 0.096167 0.017000
'''
data['Time'] = pd.to_datetime(data['Time'])
data = data.set_index('Time')
'''
L06_347 LS06_347 LS06_348
Time
2009-01-01 00:00:00 0.137417 0.097500 0.016833
2009-01-01 03:00:00 0.131250 0.088833 0.016417
2009-01-01 06:00:00 0.113500 0.091250 0.016750
2009-01-01 09:00:00 0.135750 0.091500 0.016250
...
'''
# 时间序列截取方式
data[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-01 19:00')] #截取一段数据
data[('2012-01-01 09:00'):('2012-01-01 19:00')]
data.tail(10) #截取最后10个数据
data[data.index.month == 1]
data[(data.index.hour > 8) & (data.index.hour <12)]
data.between_time('08:00','12:00')
时间抽样
data.resample
: 对时间进行抽样
data.resample('D').mean().head() #间隔一天取平均值
data.resample('D').max().head()
data.resample('3D').mean().head()
data.resample('M').mean().head()
data.resample('M').mean().plot()
排序
sort_values
import pandas as pd
data = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'],
'data':[4,3,2,1,12,3,4,5,7]})
'''
data group
0 4 a
1 3 a
2 2 a
3 1 b
4 12 b
5 3 b
6 4 c
7 5 c
8 7 c
'''
#按照group降序的基础上,data升序
data.sort_values(by=['group','data'],ascending = [False,True],inplace=True)
'''
data group
6 4 c
7 5 c
8 7 c
3 1 b
5 3 b
4 12 b
2 2 a
1 3 a
0 4 a
'''
data = pd.DataFrame({'k1':['one']*3+['two']*4,
'k2':[3,2,1,3,3,4,4]})
'''
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
4 two 3
5 two 4
6 two 4
'''
# 根据关键字进行排序
data.sort_values(by='k2')
'''
2 one 1
1 one 2
0 one 3
3 two 3
4 two 3
5 two 4
6 two 4
'''
去重
drop_duplicates()
: 去重操作
# k1, k2 的值都相等时才除去
data.drop_duplicates()
'''
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
5 two 4
'''
# 只针对k1去重
data.drop_duplicates(subset='k1')
'''
k1 k2
0 one 3
3 two 3
'''
映射
apply
data = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],'data':[1,2,3,4,5,6,7]})
'''
data food
0 1 A1
1 2 A2
2 3 B1
3 4 B2
4 5 B3
5 6 C1
6 7 C2
'''
def food_map(series):
    """Map a row's food code (e.g. 'A1', 'B3') to its category letter.

    Intended for df.apply(..., axis='columns'), which passes each row as a
    Series. Returns 'A', 'B' or 'C' for known codes, and None for any code
    not in the table (same as the implicit fall-through of an if/elif chain).
    """
    categories = {
        'A1': 'A', 'A2': 'A',
        'B1': 'B', 'B2': 'B', 'B3': 'B',
        'C1': 'C', 'C2': 'C',
    }
    # .get() yields None for unknown codes, preserving the original contract.
    return categories.get(series['food'])
data['food_map'] = data.apply(food_map,axis = 'columns')
'''
data food food_map
0 1 A1 A
1 2 A2 A
2 3 B1 B
3 4 B2 B
4 5 B3 B
5 6 C1 C
6 7 C2 C
'''
map
food2Upper = {
'A1':'A',
'A2':'A',
'B1':'B',
'B2':'B',
'B3':'B',
'C1':'C',
'C2':'C'
}
data['upper'] = data['food'].map(food2Upper)
'''
data food food_map upper
0 1 A1 A A
1 2 A2 A A
2 3 B1 B B
3 4 B2 B B
4 5 B3 B B
5 6 C1 C C
6 7 C2 C C
'''
assign
: 添加一列
import numpy as np
df = pd.DataFrame({'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df2 = df.assign(ration = df['data1']/df['data2'])
'''
data1 data2 ration
0 -1.069925 -0.186540 5.735617
1 0.636127 0.020425 31.143814
2 0.366197 -0.102836 -3.560992
3 -0.975327 0.451201 -2.161624
4 -1.562407 -2.436845 0.641160
'''
drop
: 去除一列
df2.drop('ration',axis='columns',inplace=True)
'''
data1 data2
0 -1.069925 -0.186540
1 0.636127 0.020425
2 0.366197 -0.102836
3 -0.975327 0.451201
4 -1.562407 -2.436845
'''
分类
pd.cut
: 返回每个元素对应的区间
ages = [15,18,20,21,22,34,41,52,63,79]
bins = [10,40,80]
bins_res = pd.cut(ages,bins)
'''
[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]
'''
labels
: 返回每个元素对应区间的标号
bins_res.labels
'''
array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int8)
'''
value_counts
: 返回每个区间对应的个数
pd.value_counts(bins_res)
'''
(10, 40] 6
(40, 80] 4
dtype: int64
'''
pd.cut(ages,[10,30,50,80])
'''
[(10, 30], (10, 30], (10, 30], (10, 30], (10, 30], (30, 50], (30, 50], (50, 80], (50, 80], (50, 80]]
'''
group_names = ['Yonth','Mille','Old']
pd.value_counts(pd.cut(ages,[10,20,50,80],labels=group_names))
'''
Mille 4
Old 3
Yonth 3
dtype: int64
'''
判空
nan
: 表示数据为空
df = pd.DataFrame([range(3),[0, np.nan,0],[0,0,np.nan],range(3)])
'''
0 1 2
0 0 1.0 2.0
1 0 NaN 0.0
2 0 0.0 NaN
3 0 1.0 2.0
'''
isnull()
: 返回值为布尔类型,若为nan则返回true, 否则返回false
df.isnull()
'''
0 1 2
0 False False False
1 False True False
2 False False True
3 False False False
'''
isnull().any()
: 返回值为布尔类型,若该列(行)数据中存在nan则返回true, 否则返回false
# 默认判断每一列的数据
df.isnull().any()
'''
0 False
1 True
2 True
dtype: bool
'''
# axis = 1 判断每一行的数据
df.isnull().any(axis = 1)
'''
0 False
1 True
2 True
3 False
dtype: bool
'''
df.fillna(n)
: 将值为nan的数据用n填充
df.fillna(5)
'''
0 1 2
0 0 1.0 2.0
1 0 5.0 0.0
2 0 0.0 5.0
3 0 1.0 2.0
'''
df[df.isnull().any(axis = 1)]
'''
0 1 2
1 0 NaN 0.0
2 0 0.0 NaN
'''
字符串
import pandas as pd
import numpy as np
s = pd.Series(['A','b','B','gaer','AGER',np.nan])
'''
0 A
1 b
2 B
3 gaer
4 AGER
5 NaN
dtype: object
'''
大小写转换
str.lower()
, str.upper()
: 转换为小写,大写字母格式
s.str.lower()
'''
0 a
1 b
2 b
3 gaer
4 ager
5 NaN
dtype: object
'''
s.str.upper()
'''
0 A
1 B
2 B
3 GAER
4 AGER
5 NaN
dtype: object
'''
长度
str.len()
: 求字符串的长度
s.str.len()
'''
0 1.0
1 1.0
2 1.0
3 4.0
4 4.0
5 NaN
dtype: float64
'''
去除空格
str.strip()
: 去除字符串左右两边的空格。
str.lstrip()
: 去除字符串左边的空格。
str.rstrip()
: 去除字符串右边的空格。
index = pd.Index([' tang',' yu ','di'])
Index([' tang', ' yu ', 'di'], dtype='object')
index.str.strip()
'''
Index(['tang', 'yu', 'di'], dtype='object')
'''
index.str.lstrip()
'''
Index(['tang', 'yu ', 'di'], dtype='object')
'''
index.str.rstrip()
'''
Index([' tang', ' yu', 'di'], dtype='object')
'''
替换
str.replace()
: 将字符串中的某一个字符用另外一个字符填充
df = pd.DataFrame(np.random.randn(3,2),columns = ['A a','B b'],index = range(3))
'''
A a B b
0 -1.392628 1.020082
1 0.866707 0.654731
2 -0.320871 1.360513
'''
df.columns = df.columns.str.replace(' ','_')
'''
A_a B_b
0 -1.392628 1.020082
1 0.866707 0.654731
2 -0.320871 1.360513
'''
分割
str.split()
: 按照分隔符对字符串进行分割
s = pd.Series(['a_b_C','c_d_e','f_g_h'])
'''
0 a_b_C
1 c_d_e
2 f_g_h
dtype: object
'''
# 将字符串按照分隔符分割为list
s.str.split('_')
'''
0 [a, b, C]
1 [c, d, e]
2 [f, g, h]
dtype: object
'''
# expand = True 将字符串按照分隔符分割为多个columns
s.str.split('_',expand = True)
'''
0 1 2
0 a b C
1 c d e
2 f g h
'''
# 指定分割的次数
s.str.split('_',expand = True,n=1)
'''
0 1
0 a b_C
1 c d_e
2 f g_h
'''
str.contains()
: 检查字符串中是否包含某个子串,返回值为布尔类型
s = pd.Series(['A','Aas','Afgew','Ager','Agre','Ager'])
'''
0 A
1 Aas
2 Afgew
3 Ager
4 Agre
5 Ager
dtype: object
'''
s.str.contains('Ag')
'''
0 False
1 False
2 False
3 True
4 True
5 True
dtype: bool
'''
str.get_dummies(sep = )
: 按照分割符sep 将字符串分割成多个columns
s = pd.Series(['a','a|b','a|c'])
'''
0 a
1 a|b
2 a|c
dtype: object
'''
s.str.get_dummies(sep = '|')
'''
a b c
0 1 0 0
1 1 1 0
2 1 0 1
'''
绘图
折线图
单条折线图
%matplotlib inline
import pandas as pd
import numpy as np
s = pd.Series(np.random.randn(10),index = np.arange(0,100,10))
s.plot()
多条折线图
df = pd.DataFrame(np.random.randn(10, 4).cumsum(0),
index = np.arange(0, 100, 10),
columns = ['A', 'B', 'C', 'D'])
df.plot()
柱状图
水平和竖直柱状图
import matplotlib.pyplot as plt
fig,axes = plt.subplots(2,1) #子图
data = pd.Series(np.random.rand(16),index=list('abcdefghijklmnop'))
data.plot(ax = axes[0],kind='bar') #竖直柱状图
data.plot(ax = axes[1],kind='barh') #水平柱状图
多个指标柱状图
df = pd.DataFrame(np.random.rand(6, 4),
index = ['one', 'two', 'three', 'four', 'five', 'six'],
columns = pd.Index(['A', 'B', 'C', 'D'], name = 'Genus'))
df.plot(kind='bar')
tips = pd.read_csv('tips.csv')
tips.total_bill.plot(kind='hist',bins=50)
'''
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
'''
直方图
tips = pd.read_csv('tips.csv')
tips.head()
tips.total_bill.plot(kind='hist',bins=50)
散点图
macro = pd.read_csv('macrodata.csv')
macro.head()
data = macro[['quarter','realgdp','realcons']]
data.plot.scatter('quarter','realgdp')
pd.scatter_matrix(data,color='g',alpha=0.3)
大数据处理
向下类型转换
import pandas as pd
gl = pd.read_csv('game_logs.csv')
gl.head()
gl.info(memory_usage='deep') # 占用内存大小
for dtype in ['float64','object','int64']: #查看每种类型平均占用内存大小
selected_dtype = gl.select_dtypes(include=[dtype])
mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
mean_usage_mb = mean_usage_b / 1024 ** 2
print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))
import numpy as np
int_types = ["uint8", "int8", "int16","int32","int64"]
for it in int_types:
print(np.iinfo(it)) #查看每种类型表示数的范围大小
def mem_usage(pandas_obj): #计算内存占用
    """Return the deep memory footprint of a DataFrame or Series as 'X.XX MB'.

    DataFrame.memory_usage(deep=True) returns one value per column (plus the
    index), so it must be summed; Series.memory_usage(deep=True) is already a
    single scalar.
    """
    if isinstance(pandas_obj, pd.DataFrame):
        total_bytes = pandas_obj.memory_usage(deep=True).sum()
    else:
        # anything that is not a DataFrame is assumed to be a Series
        total_bytes = pandas_obj.memory_usage(deep=True)
    # bytes -> megabytes, rendered with two decimal places
    return "{:03.2f} MB".format(total_bytes / 1024 ** 2)
gl_int = gl.select_dtypes(include=['int64'])
converted_int = gl_int.apply(pd.to_numeric,downcast='unsigned') #转换数据类型
print(mem_usage(gl_int)) # 7.87 MB
print(mem_usage(converted_int))# 1.48 MB
去重
gl_obj = gl.select_dtypes(include = ['object']).copy()
gl_obj.describe() #重复值过多
dow = gl_obj.day_of_week
dow.head()
dow_cat = dow.astype('category')
dow_cat.head()
'''
0 Thu
1 Fri
2 Sat
3 Mon
4 Tue
Name: day_of_week, dtype: category
Categories (7, object): [Fri, Mon, Sat, Sun, Thu, Tue, Wed]
'''
dow_cat.head(10).cat.codes#object类型转换为category
'''
0 4
1 0
2 2
3 1
4 5
5 4
6 2
7 2
8 1
9 5
dtype: int8
'''
时间压缩
date = optimized_gl.date
date[:5]
'''
0 18710504
1 18710505
2 18710506
3 18710508
4 18710509
Name: date, dtype: uint32
'''
print (mem_usage(date)) #0.66MB
optimized_gl['date'] = pd.to_datetime(date,format='%Y%m%d')
print (mem_usage(optimized_gl['date'])) #1.31 MB
optimized_gl['date'][:5]
'''
0 1871-05-04
1 1871-05-05
2 1871-05-06
3 1871-05-08
4 1871-05-09
Name: date, dtype: datetime64[ns]
'''