该代码是总结的一些pandas用法,方法暂时不全,后续会继续添加,其中代码主要参考了pandas中文文档和B站视频。
https://www.bilibili.com/video/BV1k94y1975j?vd_source=7771577bd8c0c69d43ee27a1c1ac8a1a
import pandas as pd
import numpy as np
转为DataFrame
# Build the first sample DataFrame from a column-oriented dict.
# The dict is named `data` rather than `list` so the builtin `list` type
# is not shadowed for the rest of the session.
data = {
    'name': ['a', 'b', 'c', 'd', 'e'],
    'age': [11, 23, 55, 31, 11],
    'sex': [1, 0, 1, 1, 0],
}
pd_list = pd.DataFrame(data)
pd_list
name age sex 0 a 11 1 1 b 23 0 2 c 55 1 3 d 31 1 4 e 11 0
获取某一行
# Row selection two ways: by a boolean mask built from a column,
# and by index label.
name_a = pd_list.loc[pd_list['name'].eq('a')]
name_a
pd_list.loc[3]
name d
age 31
sex 1
Name: 3, dtype: object
获取列名和行名
pd_list. columns
Index(['name', 'age', 'sex'], dtype='object')
pd_list. index
RangeIndex(start=0, stop=5, step=1)
更改列名
# Rename column 'age' to 'Age'. inplace=True mutates pd_list directly,
# so the following cells address the column as 'Age'.
pd_list. rename( columns= { 'age' : 'Age' } , inplace= True )
pd_list
name Age sex 0 a 11 1 1 b 23 0 2 c 55 1 3 d 31 1 4 e 11 0
统计某列相同内容出现次数
pd_list[ 'Age' ] . value_counts( )
11 2
23 1
55 1
31 1
Name: Age, dtype: int64
根据某列的条件筛选行
# Keep only the rows whose 'Age' is strictly greater than 30.
pd_list.loc[pd_list['Age'].gt(30)]
计算某列平均值
pd_list[ 'Age' ] . mean( )
26.2
pandas某列转为列表
# Convert the 'Age' column (a Series) into a plain Python list.
age_list = pd_list['Age'].tolist()
age_list
[11, 23, 55, 31, 11]
去除某列重复值
# Rebuild the sample DataFrame with duplicate ages and one missing value
# (np.nan), which makes the 'age' column float64.
# The dict is named `data` rather than `list` so the builtin `list` type
# is not shadowed.
data = {
    'name': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    'age': [11, 23, 55, 31, 11, 11, np.nan, 34],
    'sex': [1, 0, 1, 1, 0, 1, 1, 0],
}
pd_list = pd.DataFrame(data)
pd_list
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 3 d 31.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1 7 h 34.0 0
# Drop rows whose 'age' repeats an earlier row's value, keeping the first
# occurrence. The result is a new frame that is only displayed here;
# pd_list itself is not modified.
pd_list. drop_duplicates( [ 'age' ] )
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 3 d 31.0 1 6 g NaN 1 7 h 34.0 0
保存到Excel
# Write the frame to 'list.xlsx' (a relative path).
# NOTE(review): an Excel writer engine such as openpyxl must be installed - confirm the environment.
pd_list. to_excel( 'list.xlsx' )
查看数据列数行数
pd_list. shape
(8, 3)
pd_list. shape[ 0 ]
8
pd_list. shape[ 1 ]
3
交换两列位置
pd_list
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 3 d 31.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1 7 h 34.0 0
# Swap the positions of the 'age' and 'sex' columns.
# The selector lists all three positions; [2, 1] alone would drop 'name'.
# The reordered frame is kept in `pd_listd` so `pd_list` keeps its original
# column order for the cells that follow, and `pd_listd` is the frame shown.
swa = pd_list.columns[[0, 2, 1]]
pd_listd = pd_list[swa]
pd_listd  # column order is now: name, sex, age
找到限定条件的某行
# Select the row(s) whose 'age' equals the column maximum.
pd_list.loc[pd_list['age'].eq(pd_list['age'].max())]
查看前几行和后几行
pd_list. head( 3 )
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1
pd_list. tail( 3 )
name age sex 5 f 11.0 1 6 g NaN 1 7 h 34.0 0
删除某行
# Remove the row whose index label is 3. inplace=True mutates pd_list,
# leaving a gap in the index labels (0, 1, 2, 4, ...).
pd_list. drop( index= 3 , inplace= True )
pd_list
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1 7 h 34.0 0
添加一行数据(注意:DataFrame.append 已弃用并在 pandas 2.0 中移除;DataFrame.insert 用于插入列而不是行)
# Append one row in place under a label that is not yet in use.
# len(pd_list.index) is only a safe label while the index is an unbroken
# 0..n-1 range: once a row has been dropped it can equal an existing label,
# and .loc then overwrites that row instead of adding a new one.
# Assumes an integer index; an empty frame starts at label 0.
next_label = pd_list.index.max() + 1 if len(pd_list.index) else 0
pd_list.loc[next_label] = ['ww', 17, 0]
pd_list
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 3 d 31.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1 7 h 34.0 0 8 ww 17.0 0
# Add one row by concatenating a one-row frame.
# DataFrame.append was deprecated and later removed from pandas;
# pd.concat is the supported replacement and emits no FutureWarning.
# ignore_index=True renumbers the rows 0..n-1 in the result.
new_row = pd.DataFrame([{'name': 'kw', 'age': 15, 'sex': 1}])
pd_list = pd.concat([pd_list, new_row], ignore_index=True)
pd_list
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 3 d 31.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1 7 h 34.0 0 8 ww 17.0 0 9 kw 15.0 1
# DataFrame.insert adds a *column* (loc, column, value), accepts no
# ignore_index argument and returns None, so it cannot be used to add a row.
# A row is added by concatenating a one-row frame instead.
new_row = pd.DataFrame([{'name': 'kw', 'age': 15, 'sex': 1}])
pd_list = pd.concat([pd_list, new_row], ignore_index=True)
pd_list
以某列为标准进行排序
# Sort by 'age' in ascending order; rows with a missing age come last.
# The result gets its own name: binding it to `pd` would shadow the pandas
# module and break every later `pd.` call (e.g. pd.read_excel).
sorted_by_age = pd_list.sort_values('age')
sorted_by_age
name age sex 0 a 11.0 1 4 e 11.0 0 5 f 11.0 1 1 b 23.0 0 7 h 34.0 0 2 c 55.0 1 6 g NaN 1
# Sort by 'age' in descending order; rows with a missing age still come last.
# The result gets its own name: binding it to `pd` would shadow the pandas
# module and break every later `pd.` call (e.g. pd.read_excel).
sorted_by_age_desc = pd_list.sort_values('age', ascending=False)
sorted_by_age_desc
name age sex 2 c 55.0 1 7 h 34.0 0 1 b 23.0 0 0 a 11.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1
读取本地文件
# Load an Excel file, then a CSV file, into a DataFrame.
# The path literal reads "path/filename" - presumably a placeholder to be
# replaced with a real file location.
# Both results are bound to `frame`, so the second call overwrites the first.
frame = pd. read_excel( '路径/文件名' )
frame = pd. read_csv( '路径/文件名' )
查看索引,数据类型,内存,数据列汇总,每列数据类型
pd_list. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 0 to 7
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 7 non-null object
1 age 6 non-null float64
2 sex 7 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 224.0+ bytes
pd_list. describe( )
age sex count 6.000000 7.000000 mean 24.166667 0.571429 std 17.713460 0.534522 min 11.000000 0.000000 25% 11.000000 0.000000 50% 17.000000 1.000000 75% 31.250000 1.000000 max 55.000000 1.000000
pd_list. dtypes
name object
age float64
sex int64
dtype: object
添加删除某一列
# Assigning a scalar to a new column label creates the column and fills
# every row with that value.
pd_list[ 'new' ] = 1
pd_list
name age sex new 0 a 11.0 1 1 1 b 23.0 0 1 2 c 55.0 1 1 3 d 31.0 1 1 4 e 11.0 0 1 5 f 11.0 1 1 6 g NaN 1 1 7 h 34.0 0 1
del pd_list[ 'new' ]
pd_list
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 3 d 31.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1 7 h 34.0 0
合并某两列到新的一列
# Build column 'new' by element-wise string concatenation of two columns.
# map(str) converts each value to str so that `+` concatenates text.
# NOTE(review): both operands are the 'name' column, so each value is simply
# doubled ('aa', 'bb', ...). If two different columns were intended, replace
# one operand - confirm.
pd_list[ 'new' ] = pd_list[ 'name' ] . map ( str ) + pd_list[ 'name' ]
pd_list
name age sex new 0 a 11.0 1 aa 1 b 23.0 0 bb 2 c 55.0 1 cc 3 d 31.0 1 dd 4 e 11.0 0 ee 5 f 11.0 1 ff 6 g NaN 1 gg 7 h 34.0 0 hh
del pd_list[ 'new' ]
pd_list
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 3 d 31.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1 7 h 34.0 0
查看某列最大值和最小值,以及差
pd_list[ 'age' ] . max ( )
55.0
pd_list[ 'age' ] . min ( )
11.0
pd_list[ 'age' ] . max ( ) - pd_list[ 'age' ] . min ( )
44.0
# Same max-min range computed with apply: the double brackets select a
# one-column DataFrame, the lambda receives that column as a Series, and the
# result is a Series indexed by column name.
pd_list[ [ 'age' ] ] . apply ( lambda x: x. max ( ) - x. min ( ) )
age 44.0
dtype: float64
设置索引
# Use the 'name' column as the row index. The call returns a new frame that
# is only displayed here; pd_list keeps its original integer index.
pd_list. set_index( 'name' )
age sex name a 11.0 1 b 23.0 0 c 55.0 1 d 31.0 1 e 11.0 0 f 11.0 1 g NaN 1 h 34.0 0
用loc和iloc进行选取
pd_list
name age sex 0 a 11.0 1 1 b 23.0 0 2 c 55.0 1 3 d 31.0 1 4 e 11.0 0 5 f 11.0 1 6 g NaN 1 7 h 34.0 0
# Label-based lookup: the row whose index label is 3.
pd_list. loc[ 3 ]
name d
age 31.0
sex 1
Name: 3, dtype: object
# Position-based lookup: the row at 0-based position 3. It matches the
# label-based lookup here only because the index labels equal the positions.
pd_list. iloc[ 3 ]
name d
age 31.0
sex 1
Name: 3, dtype: object
# Rows labelled 0, 1 and 2, restricted to the 'name' and 'age' columns.
pd_list.loc[[0, 1, 2], ['name', 'age']]
name age 0 a 11.0 1 b 23.0 2 c 55.0
# Label-based: rows labelled 3 and 5, columns 'name' and 'sex'.
pd_list. loc[ [ 3 , 5 ] , [ 'name' , 'sex' ] ]
# Label slice: with .loc both endpoints are included, so this covers
# labels 1 through 5.
pd_list. loc[ 1 : 5 , [ 'name' , 'sex' ] ]
# Position-based: rows at positions 3 and 5, columns at positions 0 and 2.
pd_list. iloc[ [ 3 , 5 ] , [ 0 , 2 ] ]