Pandas 基本操作介绍
本节介绍 Pandas 的基本操作, 方便日后查询.
载入相关的库
import warnings
warnings. filterwarnings( 'ignore' )
import os
import sys
from os. path import join, exists
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
% matplotlib inline
创建操作数据
假设下面这些数据存储在 data.csv 文件中.
日期,年龄,收入,支出
2020-02-02,22,100000.9,20000.6
2020-02-03,18,20800.8,10000.3
2020-02-02,32,508008.8,10300.4
2020-02-02,26,332342.8,101240.3
2020-02-03,24,332344.6,101240.5
2020-02-04,54,134364.2,11240.5
2020-02-03,53,254354.8,11240.0
2020-02-04,13,254234.2,1140.0
数据读取
下面提供一些数据文件读取的函数, 方便将 csv 或 excel 中的数据读入到 pandas 中.
从文件中读取数据
def read_raw_csv_file ( file_name) :
"""
直接读取 csv 文件, 不做任何预处理
"""
df = pd. read_csv( file_name, encoding= 'utf-8' )
return df
def preprocess ( raw_csv_file, output_file) :
"""
处理编码问题
"""
print ( '>>>[Preprocess] Preprocessing Raw CSV File: {}' . format ( raw_csv_file) )
with open ( raw_csv_file, 'rb' ) as f:
with open ( output_file, 'w' ) as out:
for line in f:
line = line. decode( 'gb2312' ) . strip( ) . rstrip( ',' )
out. write( '{}\n' . format ( line) )
print ( '>>>[Preprocess] Saving CSV File To: {}' . format ( output_file) )
def read_csv_with_preprocess ( raw_csv_file, tmp_file= 'tmp.csv' , rm_tmp= True ) :
"""
读取原始 csv 文件, 同时包含预处理.
预处理是为了解决中文内容无法正确读取的问题.
"""
preprocess( raw_csv_file, tmp_file)
df = read_raw_csv_file( tmp_file)
if rm_tmp:
os. remove( tmp_file)
return df
def read_xlsx_file ( file_name) :
"""
读取 xlsx 文件, 一般读取第一个 sheet
"""
ordered_dict_dfs = pd. read_excel( file_name, sheet_name= None )
dfs = list ( ordered_dict_dfs. values( ) ) [ 0 ]
return dfs
file_name = 'data.csv'
df = read_raw_csv_file( file_name)
df. head( )
日期 年龄 收入 支出 0 2020-02-02 22 100000.9 20000.6 1 2020-02-03 18 20800.8 10000.3 2 2020-02-02 32 508008.8 10300.4 3 2020-02-02 26 332342.8 101240.3 4 2020-02-03 24 332344.6 101240.5
df. tail( 3 )
日期 年龄 收入 支出 5 2020-02-04 54 134364.2 11240.5 6 2020-02-03 53 254354.8 11240.0 7 2020-02-04 13 254234.2 1140.0
df = pd. read_csv( 'data1.csv' , header= None , encoding= 'utf-8' )
df. head( 3 )
0 1 2 3 0 2020-02-02 22 100000.9 20000.6 1 2020-02-03 18 20800.8 10000.3 2 2020-02-02 32 508008.8 10300.4
df. columns = [ 'Date' , 'Age' , 'Income' , 'Expense' ]
df. head( 3 )
Date Age Income Expense 0 2020-02-02 22 100000.9 20000.6 1 2020-02-03 18 20800.8 10000.3 2 2020-02-02 32 508008.8 10300.4
从内存中读取数据
data_list = [
[ '2020-02-02' , 22 , '100000.9' , 20000.6 ] ,
[ '2020-02-03' , 18 , '20800.8' , 10000.3 ] ,
[ '2020-02-02' , 32 , '508008.8' , 10300.4 ] ,
]
df = pd. DataFrame( data_list, columns= [ 'Date' , 'Age' , 'Income' , 'Expense' ] , index= [ 'A' , 'B' , 'C' ] )
df. head( )
Date Age Income Expense A 2020-02-02 22 100000.9 20000.6 B 2020-02-03 18 20800.8 10000.3 C 2020-02-02 32 508008.8 10300.4
print ( df. dtypes)
print ( list ( map ( lambda x: x. dtype, [ df[ col] for col in df. columns] ) ) )
print ( list ( map ( lambda x: x[ 1 ] . dtype, df. iteritems( ) ) ) )
Date object
Age int64
Income object
Expense float64
dtype: object
[dtype('O'), dtype('int64'), dtype('O'), dtype('float64')]
[dtype('O'), dtype('int64'), dtype('O'), dtype('float64')]
df[ 'Income' ] = df[ 'Income' ] . astype( np. float64)
df. dtypes
Date object
Age int64
Income float64
Expense float64
dtype: object
data_dict = {
'A' : [ 1 , 2 , 3 ] ,
'B' : [ 4 , 5 , 6 ] ,
'C' : [ 7 , 8 , 9 ]
}
df = pd. DataFrame( data_dict)
df. head( )
数据访问
遍历DataFrame中的每一行和每一列
df = pd. DataFrame(
[
[ 1 , 2 , 3 ] ,
[ 4 , 5 , 6 ] ,
[ 7 , 8 , 9 ]
] ,
columns= [ 'A' , 'B' , 'C' ] ,
index= [ 'a' , 'b' , 'c' ]
)
for label, content in df. iteritems( ) :
print ( 'label: {}' . format ( label) )
print ( 'content:\n{}' . format ( content) )
print ( 'content (without index):\n{}' . format ( content. to_string( index= False , header= False ) ) )
break
label: A
content:
a 1
b 4
c 7
Name: A, dtype: int64
content (without index):
1
4
7
for label, content in df. iterrows( ) :
print ( 'label: {}' . format ( label) )
print ( 'content:\n{}' . format ( content) )
print ( 'content (without index):\n{}' . format ( content. to_string( index= False , header= False ) . replace( '\n' , ' ' ) ) )
break
label: a
content:
A 1
B 2
C 3
Name: a, dtype: int64
content (without index):
1 2 3
for tp in df. itertuples( index= True , name= 'Pandas' ) :
print ( tp)
print ( getattr ( tp, 'A' ) , getattr ( tp, 'B' ) , getattr ( tp, 'C' ) )
Pandas(Index='a', A=1, B=2, C=3)
1 2 3
Pandas(Index='b', A=4, B=5, C=6)
4 5 6
Pandas(Index='c', A=7, B=8, C=9)
7 8 9
行列操作
参考: pandas 删除行,删除列,增加行,增加列
增加行
使用 DataFrame 的 append
方法增加 pd.Series
作为新的一行, 注意一般 append
中需要使用 ignore_index=True
参数; 使用 DataFrame 的 loc
或 at
方法来增加新的行. 逐行增加 插入行
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
new_row = pd. Series( [ 9 , 0 , 1 ] , index= df. columns, name= 4 )
df = df. append( new_row, ignore_index= True )
df. head( )
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df. loc[ 4 ] = [ 9 , 0 , 1 ]
df. at[ 5 ] = [ 0 , 9 , 1 ]
df. head( )
A B C a 0.0 1.0 2.0 b 3.0 4.0 5.0 c 6.0 7.0 8.0 4 9.0 0.0 1.0 5 0.0 9.0 1.0
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df. loc[ len ( df) ] = [ 5 , 6 , 7 ]
df. head( )
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df = df. reindex( index= df. index. insert( 2 , 4 ) )
df. loc[ 4 ] = [ 5 , 6 , 6 ]
df. head( )
A B C a 0.0 1.0 2.0 b 3.0 4.0 5.0 4 5.0 6.0 6.0 c 6.0 7.0 8.0
删除行
使用 drop
, 其中 axis
参数用于区别删除行还是删除列. axis
默认为 0.
推荐查阅 drop_duplicates()
用于去重.
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df. drop( df. index[ [ 0 , 2 ] ] , axis= 0 , inplace= True )
df. head( )
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df. drop( [ 'a' , 'b' ] , inplace= True )
df. head( )
增加列
遍历 DataFrame 获取序列 (不增加列) 通过 [ ] 或 loc 增加列 (常用) 通过 Insert 来增加列
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
s = [ a + c for a, c in zip ( df[ 'A' ] , df[ 'C' ] ) ]
print ( s)
s = [ row[ 'A' ] + row[ 'C' ] for i, row in df. iterrows( ) ]
print ( s)
s = df. apply ( lambda row: row[ 'A' ] + row[ 'C' ] , axis= 1 )
print ( s)
s = df[ 'A' ] + df[ 'C' ]
print ( s)
s = df[ 'A' ] . values + df[ 'C' ] . values
print ( s)
[2, 8, 14]
[2, 8, 14]
a 2
b 8
c 14
dtype: int64
a 2
b 8
c 14
dtype: int64
[ 2 8 14]
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df[ 'D' ] = df[ 'A' ] + df[ 'B' ]
df. loc[ : , 'E' ] = df[ 'A' ] + df[ 'B' ]
df. head( )
A B C D E a 0 1 2 1 1 b 3 4 5 7 7 c 6 7 8 13 13
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df. insert( 2 , 'D' , [ 3 , 4 , 5 ] )
df. head( )
删除列
使用 drop
, 其中 axis
参数用于区别删除行还是删除列.
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df. drop( df. columns[ [ 0 , 2 ] ] , axis= 1 , inplace= True )
df. head( )
df = pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= list ( 'abc' ) , columns= list ( 'ABC' ) )
df. drop( [ 'A' , 'C' ] , axis= 1 , inplace= True )
df. head( )
数据分组 Groupby
data = np. array( [ [ '2020-02-02' , 18 , 100000.9 , 20000.6 ] ,
[ '2020-02-03' , 18 , 20800.8 , 10000.3 ] ,
[ '2020-02-02' , 20 , 508008.8 , 10300.4 ] ,
[ '2020-02-02' , 18 , 332342.8 , 101240.3 ] ,
[ '2020-02-03' , 18 , 332344.6 , 101240.5 ] ,
[ '2020-02-04' , 20 , 134364.2 , 11240.5 ] ,
[ '2020-02-03' , 20 , 254354.8 , 11240.0 ] ,
[ '2020-02-04' , 18 , 254234.2 , 1140.0 ] ] , dtype= object )
df = pd. DataFrame( data, columns= [ 'Date' , 'Age' , 'Income' , 'Expense' ] )
df. head( 3 )
Date Age Income Expense 0 2020-02-02 18 100001 20000.6 1 2020-02-03 18 20800.8 10000.3 2 2020-02-02 20 508009 10300.4
t = df. groupby( [ 'Date' ] ) . sum ( )
t. head( )
Age Income Expense Date 2020-02-02 56 940352.5 131541.3 2020-02-03 56 607500.2 122480.8 2020-02-04 38 388598.4 12380.5
t = df. groupby( [ 'Date' , 'Age' ] ) . sum ( )
t. head( )
Income Expense Date Age 2020-02-02 18 432343.7 121240.9 20 508008.8 10300.4 2020-02-03 18 353145.4 111240.8 20 254354.8 11240.0 2020-02-04 18 254234.2 1140.0
将 Groupby 对象转换为 DataFrame 对象
通过上面的例子我们发现, 使用 groupby 得到 Groupby 对象, 但有些场景下我们需要的是 DataFrame 对象, 此时需要给 groupby
增加 as_index=False
的参数.
t = df. groupby( [ 'Date' , 'Age' ] , as_index= False ) . sum ( )
t. head( )
Date Age Income Expense 0 2020-02-02 18 432343.7 121240.9 1 2020-02-02 20 508008.8 10300.4 2 2020-02-03 18 353145.4 111240.8 3 2020-02-03 20 254354.8 11240.0 4 2020-02-04 18 254234.2 1140.0
个性化需求
打印 DataFrame 但不输出 row number/index
参考: https://stackoverflow.com/questions/52396477/printing-a-pandas-dataframe-without-row-number-index
df = pd. DataFrame(
[
[ 1 , 2 , 3 ] ,
[ 4 , 5 , 6 ] ,
[ 7 , 8 , 9 ]
] ,
columns= [ 'A' , 'B' , 'C' ] ,
index= [ 'a' , 'b' , 'c' ]
)
print ( df. to_string( index= False , header= False ) )
print ( df. values. tolist( ) )
print ( '\n' . join( [ ' ' . join( map ( str , row) ) for row in df. values. tolist( ) ] ) )
1 2 3
4 5 6
7 8 9
[[1, 2, 3], [4, 5, 6], [7, 8, 9]]
1 2 3
4 5 6
7 8 9