pandas 相关函数

最新推荐文章于 2023-04-11 14:07:43 发布

m0_50889751

最新推荐文章于 2023-04-11 14:07:43 发布

阅读量264

点赞数

文章标签： python

本文链接：https://blog.csdn.net/m0_50889751/article/details/128016465

版权

import pandas as pd
import numpy as np

np.array([1,2,3])

array([1, 2, 3])

#常见函数

np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

np.linspace(2,3,num=5, endpoint=False)

array([2. , 2.2, 2.4, 2.6, 2.8])

np.random.randn(4,4)

array([[ 0.31090058,  1.36645632,  0.86897091,  0.09761096],
       [ 0.35996995,  1.0850638 ,  0.68081139, -1.54973962],
       [ 1.09726152,  0.85757526,  1.00096873,  1.19056036],
       [-0.71615881,  0.25503605,  0.23917234,  0.53867985]])

numpy.random.randint(low, high=None, size=None, dtype=’l’)

np.random.randint(0,4,size=(2,4))

array([[1, 0, 2, 2],
       [2, 3, 3, 1]])

np.zeros(4)

array([0., 0., 0., 0.])

np.zeros(4).reshape(2,2)

array([[0., 0.],
       [0., 0.]])

df = pd.DataFrame({
    '国家':['中国', '美国', '日本'],
    '地区':['亚洲', '北美','亚洲'],
    '人口':['13.97', '3.28','1.26'],
    'GDP':[14.34,21.43,5.08]
})
df

	国家	地区	人口	GDP
0	中国	亚洲	13.97	14.34
1	美国	北美	3.28	21.43
2	日本	亚洲	1.26	5.08

s = pd.Series(np.random.randn(5),index = ['a','b','c','d','e'])
s

a   -0.610498
b   -1.111079
c    1.495161
d    1.233846
e    0.854973
dtype: float64

*df = pd.DataFrame(data, index, columns=None)

*利用Series组成字典

d = {
    'x':pd.Series([1,2,3],index=['a','b','c']),
    'y':pd.Series([1,2,3,4],index=['a','b','c','d'])
}
pd.DataFrame(d)

	x	y
a	1.0	1
b	2.0	2
c	3.0	3
d	NaN	4

建立索引

data = 'https://www.gairuo.com/file/data/dataset/team.xlsx'
df = pd.read_excel(data,index_col='name')
df

	team	Q1	Q2	Q3	Q4
name
Liver	E	89	21	24	64
Arry	C	36	37	37	57
Ack	A	57	60	18	84
Eorge	C	93	96	71	78
Oah	D	65	49	61	86
...	...	...	...	...	...
Gabriel	C	48	59	87	74
Austin7	C	21	31	30	43
Lincoln4	C	98	93	1	20
Eli	E	11	74	58	91
Ben	E	21	43	41	74

100 rows × 5 columns

df = pd.read_excel(data)
df

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

df.set_index('name')

	team	Q1	Q2	Q3	Q4
name
Liver	E	89	21	24	64
Arry	C	36	37	37	57
Ack	A	57	60	18	84
Eorge	C	93	96	71	78
Oah	D	65	49	61	86
...	...	...	...	...	...
Gabriel	C	48	59	87	74
Austin7	C	21	31	30	43
Lincoln4	C	98	93	1	20
Eli	E	11	74	58	91
Ben	E	21	43	41	74

100 rows × 5 columns

s = pd.Series([i for i in range(100)])
s

0      0
1      1
2      2
3      3
4      4
      ..
95    95
96    96
97    97
98    98
99    99
Length: 100, dtype: int64

df.set_index(s)

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

重置索引

df.reset_index()

	index	name	team	Q1	Q2	Q3	Q4
0	0	Liver	E	89	21	24	64
1	1	Arry	C	36	37	37	57
2	2	Ack	A	57	60	18	84
3	3	Eorge	C	93	96	71	78
4	4	Oah	D	65	49	61	86
...	...	...	...	...	...	...	...
95	95	Gabriel	C	48	59	87	74
96	96	Austin7	C	21	31	30	43
97	97	Lincoln4	C	98	93	1	20
98	98	Eli	E	11	74	58	91
99	99	Ben	E	21	43	41	74

100 rows × 7 columns

df

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

df.index

RangeIndex(start=0, stop=100, step=1)

df.reset_index(drop=True)

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

df.index

RangeIndex(start=0, stop=100, step=1)

df.head()

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86

dataframe.reset_index(drop=True, inplace=True)

df.tail()

	name	team	Q1	Q2	Q3	Q4
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

df.columns

Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')

df.describe()

	Q1	Q2	Q3	Q4
count	100.000000	100.000000	100.000000	100.000000
mean	49.200000	52.550000	52.670000	52.780000
std	29.962603	29.845181	26.543677	27.818524
min	1.000000	1.000000	1.000000	2.000000
25%	19.500000	26.750000	29.500000	29.500000
50%	51.500000	49.500000	55.000000	53.000000
75%	74.250000	77.750000	76.250000	75.250000
max	98.000000	99.000000	99.000000	99.000000

df.mean()

E:\Anaconda\envs\py37\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  """Entry point for launching an IPython kernel.





Q1    49.20
Q2    52.55
Q3    52.67
Q4    52.78
dtype: float64

df.mean(1)

E:\Anaconda\envs\py37\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  """Entry point for launching an IPython kernel.





0     49.50
1     41.75
2     54.75
3     84.50
4     65.25
      ...  
95    67.00
96    31.25
97    53.00
98    58.50
99    44.75
Length: 100, dtype: float64

df.value_counts()

name     team  Q1  Q2  Q3  Q4
Aaron    A     96  75  55  8     1
Liam     B     2   80  24  25    1
Matthew  C     44  33  41  98    1
Mason    D     80  96  26  49    1
Luke6    D     15  97  95  99    1
                                ..
Elliott  B     9   31  33  60    1
Elliot   C     15  17  76  22    1
Elijah   B     97  89  15  46    1
Eli      E     11  74  58  91    1
Zachary  E     12  71  85  93    1
Length: 100, dtype: int64

df

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

df['team'].value_counts()

C    22
B    22
E    20
D    19
A    17
Name: team, dtype: int64

df.groupby(by = ['team']).count()

	name	Q1	Q2	Q3	Q4
team
A	17	17	17	17	17
B	22	22	22	22	22
C	22	22	22	22	22
D	19	19	19	19	19
E	20	20	20	20	20

diff() : 用于上一个数据和下一个数据的差值
shift()

df

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

df['Q1'].shift()

0      NaN
1     89.0
2     36.0
3     57.0
4     93.0
      ... 
95    20.0
96    48.0
97    21.0
98    98.0
99    11.0
Name: Q1, Length: 100, dtype: float64

value_counts(values,sort=True, ascending=False, normalize=False,bins=None,dropna=True)

truncate(before= , after= )

Pandas 高级操作

df.Q1>36

0      True
1     False
2      True
3      True
4      True
      ...  
95     True
96    False
97     True
98    False
99    False
Name: Q1, Length: 100, dtype: bool

df.loc[:,'Q1':'Q3']

	Q1	Q2	Q3
0	89	21	24
1	36	37	37
2	57	60	18
3	93	96	71
4	65	49	61
...	...	...	...
95	48	59	87
96	21	31	30
97	98	93	1
98	11	74	58
99	21	43	41

100 rows × 3 columns

df.query()

df.query('Q1>Q2>90')

	name	team	Q1	Q2	Q3	Q4
97	Lincoln4	C	98	93	1	20

df.query('(Q1<50)&(Q2>40)&(Q3>90)')

	name	team	Q1	Q2	Q3	Q4
56	David	B	21	47	99	2
72	Luke6	D	15	97	95	99

df.filter() 支持模糊匹配，正则表达式

df.filter(items=['Q1','Q2'])

	Q1	Q2
0	89	21
1	36	37
2	57	60
3	93	96
4	65	49
...	...	...
95	48	59
96	21	31
97	98	93
98	11	74
99	21	43

100 rows × 2 columns

类型转换astype()

df.Q1.dtypes

dtype('int64')

df.Q1.astype('int32')

0     89
1     36
2     57
3     93
4     65
      ..
95    48
96    21
97    98
98    11
99    21
Name: Q1, Length: 100, dtype: int32

转为时间类型 to_datatime()

t = pd.Series(['20220810','20220811'])
t

0    20220810
1    20220811
dtype: object

pd.to_datetime(t)

0   2022-08-10
1   2022-08-11
dtype: datetime64[ns]

数据排序- sort() axis= 1表示列，axis=0表示行

df.sort_index(ascending=True)# axis, ascending, ignore_index, inplace, by=[]

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

添加修改

replace(a,b) # 将数据中的a 换成b
insert(列的具体位置，列名，列的值)# 插入新列，列的具体位置，列名，列的值

df.where()

df

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

df.iterrows()

for index, row in df.iterrows():
    print(index,row['name'],row.Q1)

0 Liver 89
1 Arry 36
2 Ack 57
3 Eorge 93
4 Oah 65
5 Harlie 24
6 Acob 61
7 Lfie 9
8 Reddie 64
9 Oscar 77
10 Leo 17
11 Logan 9
12 Archie 83
13 Theo 51
14 Thomas 80
15 James 48
16 Joshua 63
17 Henry 91
18 William 80
19 Max 97
20 Lucas 60
21 Ethan 79
22 Arthur 44
23 Mason 80
24 Isaac 74
25 Harrison 89
26 Teddy 71
27 Finley 62
28 Daniel 50
29 Riley 35
30 Edward 57
31 Joseph 67
32 Alexander 91
33 Adam 90
34 Reggie1 30
35 Samuel 9
36 Jaxon 88
37 Sebastian 1
38 Elijah 97
39 Harley 2
40 Toby 52
41 Arlo8 48
42 Dylan 86
43 Jude 8
44 Benjamin 15
45 Rory9 8
46 Tommy 29
47 Jake3 69
48 Louie 24
49 Carter7 57
50 Jenson 66
51 Hugo0 28
52 Bobby1 50
53 Frankie 18
54 Ollie3 10
55 Zachary 12
56 David 21
57 Albie1 79
58 Lewis 4
59 Luca 5
60 Ronnie 53
61 Jackson5 6
62 Matthew 44
63 Alex 14
64 Harvey2 43
65 Reuben 70
66 Jayden6 64
67 Caleb 64
68 Hunter3 38
69 Theodore3 43
70 Nathan 87
71 Blake 78
72 Luke6 15
73 Elliot 15
74 Roman 73
75 Stanley 69
76 Dexter 73
77 Michael 89
78 Elliott 9
79 Tyler 75
80 Ryan 92
81 Ellis 34
82 Finn 4
83 Albert0 85
84 Kai 66
85 Liam 2
86 Calum 14
87 Louis2 13
88 Aaron 96
89 Ezra 16
90 Leon 38
91 Connor 62
92 Grayson7 59
93 Jamie0 39
94 Aiden 20
95 Gabriel 48
96 Austin7 21
97 Lincoln4 98
98 Eli 11
99 Ben 21

for column in df:
    print(column)

name
team
Q1
Q2
Q3
Q4

for label, ser in df.items():
    print(label)
    print(ser[:3])

name
0    Liver
1     Arry
2      Ack
Name: name, dtype: object
team
0    E
1    C
2    A
Name: team, dtype: object
Q1
0    89
1    36
2    57
Name: Q1, dtype: int64
Q2
0    21
1    37
2    60
Name: Q2, dtype: int64
Q3
0    24
1    37
2    18
Name: Q3, dtype: int64
Q4
0    64
1    57
2    84
Name: Q4, dtype: int64

apply() 函数

df.name.apply(lambda x: x.lower())

0        liver
1         arry
2          ack
3        eorge
4          oah
        ...   
95     gabriel
96     austin7
97    lincoln4
98         eli
99         ben
Name: name, Length: 100, dtype: object

def mylen(x):
    return len(str(x))
df.applymap(mylen)

	name	team	Q1	Q2	Q3	Q4
0	5	1	2	2	2	2
1	4	1	2	2	2	2
2	3	1	2	2	2	2
3	5	1	2	2	2	2
4	3	1	2	2	2	2
...	...	...	...	...	...	...
95	7	1	2	2	2	2
96	7	1	2	2	2	2
97	8	1	2	2	1	2
98	3	1	2	2	2	2
99	3	1	2	2	2	2

100 rows × 6 columns

agg() # 聚合函数

# 每列最大值
df

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

df.agg({'Q1':['sum','min'],'Q2':['min','max']})

	Q1	Q2
sum	4920.0	NaN
min	1.0	1.0
max	NaN	99.0

df.Q1.max()

# 使用不同方法进行聚合
df.agg(a=('Q1','max'),
      b=('Q2','min'))

	Q1	Q2
a	98.0	NaN
b	NaN	1.0

Pandas 分组聚合

df.groupby(by=None,axis=0,level=None, as_index: bool =True, sort: bool=True,

group_keys:bool=True,observed: bool = False,

dropna : = False)

df.groupby('team').sum()

	Q1	Q2	Q3	Q4
team
A	1066	639	875	783
B	975	1218	1202	1136
C	1056	1194	1068	1127
D	860	1191	1241	1199
E	963	1013	881	1033

df.groupby('team').count()

	name	Q1	Q2	Q3	Q4
team
A	17	17	17	17	17
B	22	22	22	22	22
C	22	22	22	22	22
D	19	19	19	19	19
E	20	20	20	20	20

df.team.value_counts()

C    22
B    22
E    20
D    19
A    17
Name: team, dtype: int64

df.groupby(lambda x:x>50).sum()

	Q1	Q2	Q3	Q4
False	2832	2705	2597	2687
True	2088	2550	2670	2591

数据合并与对比

    df.append(self, other, ignore_insex=False, verify_integrity=False,sort=False)

pd.concat(objs:连接数据，axis=1, join=‘outer’, ignore_index=False, keys=None, levels=None,names=None,sort=False,copy=True)

df1 = pd.DataFrame({'x':[1,2],'y':[3,4]})
df2 = pd.DataFrame({'x':[5,6],'y':[7,8]})

df1

	x	y
0	1	3
1	2	4

pd.concat([df1,df2])

	x	y
0	1	3
1	2	4
0	5	7
1	6	8

pd.concat([df1,df2],axis=1)

	x	y	x	y
0	1	3	5	7
1	2	4	6	8

z = pd.Series([9,9], name='z')
z

0    9
1    9
Name: z, dtype: int64

df.groupby(['team',df.mean(1)>=60]).count()

E:\Anaconda\envs\py37\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  """Entry point for launching an IPython kernel.

		name	Q1	Q2	Q3	Q4
team
A	False	14	14	14	14	14
A	True	3	3	3	3	3
B	False	14	14	14	14	14
B	True	8	8	8	8	8
C	False	17	17	17	17	17
C	True	5	5	5	5	5
D	False	10	10	10	10	10
D	True	9	9	9	9	9
E	False	15	15	15	15	15
E	True	5	5	5	5	5

arrays = [[1,1,2,2],['A','B','A','B']]
index = pd.MultiIndex.from_arrays(arrays, names=('class','team'))
index

MultiIndex([(1, 'A'),
            (1, 'B'),
            (2, 'A'),
            (2, 'B')],
           names=['class', 'team'])

pd.DataFrame([{'Q1':60,'Q2':70}], index=index)

		Q1	Q2
class	team
1	A	60	70
1	B	60	70
2	A	60	70
2	B	60	70

insex_arrays = [[1,1,2,2],['男','女','男','女']]
columns_arrays = [['2019','2019','2020','2020'],['上半年','下半年','上半年','下半年']]

数据重现与透视

# 构造数据
dff = pd.DataFrame(
{
    'A':['a1','a1','a2','a3','a3','a3'],
    'B':['b1','b2','b3','b1','b2','b3'],
    'C':['c1','c2','c3','c4','c5','c6'],
    'D':['d1','d2','d3','d4','d5','d6']
})
dff

	A	B	C	D
0	a1	b1	c1	d1
1	a1	b2	c2	d2
2	a2	b3	c3	d3
3	a3	b1	c4	d4
4	a3	b2	c5	d5
5	a3	b3	c6	d6

dff.pivot(index='A',columns = 'B', values = 'C')

B	b1	b2	b3
A
a1	c1	c2	NaN
a2	NaN	NaN	c3
a3	c4	c5	c6

pd.pivot_table(df,
index=[],
columns=[],
value=,
aggfunc=np.sum,
fill_value=0,
margins=True)

dff

	A	B	C	D
0	a1	b1	c1	d1
1	a1	b2	c2	d2
2	a2	b3	c3	d3
3	a3	b1	c4	d4
4	a3	b2	c5	d5
5	a3	b3	c6	d6

dff.D=np.arange(1,7)

dff

	A	B	C	D
0	a1	b1	c1	1
1	a1	b2	c2	2
2	a2	b3	c3	3
3	a3	b1	c4	4
4	a3	b2	c5	5
5	a3	b3	c6	6

dff.pivot_table(index='A',columns='B',values='D',fill_value=0)

B	b1	b2	b3
A
a1	1	2	0
a2	0	0	3
a3	4	5	6

聚合高级操作

dff.pivot_table(index=['A','B'],columns=['C'],values='D',aggfunc=np.sum, fill_value=0, margins=True)

	C	c1	c2	c3	c4	c5	c6	All
A	B
a1	b1	1	0	0	0	0	0	1
a1	b2	0	2	0	0	0	0	2
a2	b3	0	0	3	0	0	0	3
a3	b1	0	0	0	4	0	0	4
	b2	0	0	0	0	5	0	5
	b3	0	0	0	0	0	6	6
All		1	2	3	4	5	6	21

dff1 = pd.DataFrame({
    'A':['a1','a1','a2','a2'],
    'B':['b1','b2','b1','b2'],
    'C':[1,2,3,4],
    'D':[5,6,7,8],
    'E':[5,6,7,8]
})

dff1

	A	B	C	D	E
0	a1	b1	1	5	5
1	a1	b2	2	6	6
2	a2	b1	3	7	7
3	a2	b2	4	8	8

dff1.set_index(['A','B'],inplace=True)

dff1

		C	D	E
A	B
a1	b1	1	5	5
a1	b2	2	6	6
a2	b1	3	7	7
a2	b2	4	8	8

dff1.stack()

A   B    
a1  b1  C    1
        D    5
        E    5
    b2  C    2
        D    6
        E    6
a2  b1  C    3
        D    7
        E    7
    b2  C    4
        D    8
        E    8
dtype: int64

s = dff1.stack()

s.unstack()

		C	D	E
A	B
a1	b1	1	5	5
a1	b2	2	6	6
a2	b1	3	7	7
a2	b2	4	8	8

交叉表

dff2 = pd.DataFrame(
{
    'A':['a1','a1','a2','a3','a3','a3'],
    'B':['b1','b2','b3','b1','b2','b3'],
    'C':[1,2,3,4,5,6]
})
dff2

	A	B	C
0	a1	b1	1
1	a1	b2	2
2	a2	b3	3
3	a3	b1	4
4	a3	b2	5
5	a3	b3	6

pd.crosstab(dff2['A'],dff2['B'],normalize=True)

B	b1	b2	b3
A
a1	0.166667	0.166667	0.000000
a2	0.000000	0.000000	0.166667
a3	0.166667	0.166667	0.166667

dff2

	A	B	C
0	a1	b1	1
1	a1	b2	2
2	a2	b3	3
3	a3	b1	4
4	a3	b2	5
5	a3	b3	6

dff2.melt(id_vars=['A','B'])

	A	B	variable	value
0	a1	b1	C	1
1	a1	b2	C	2
2	a2	b3	C	3
3	a3	b1	C	4
4	a3	b2	C	5
5	a3	b3	C	6

dff2.melt()

	variable	value
0	A	a1
1	A	a1
2	A	a2
3	A	a3
4	A	a3
5	A	a3
6	B	b1
7	B	b2
8	B	b3
9	B	b1
10	B	b2
11	B	b3
12	C	1
13	C	2
14	C	3
15	C	4
16	C	5
17	C	6

dff2.melt(value_vars=['C','B'])

	variable	value
0	C	1
1	C	2
2	C	3
3	C	4
4	C	5
5	C	6
6	B	b1
7	B	b2
8	B	b3
9	B	b1
10	B	b2
11	B	b3

dff2.melt(id_vars=['A'],value_vars=['B'],var_name='B_lable',value_name='B_value')

	A	B_lable	B_value
0	a1	B	b1
1	a1	B	b2
2	a2	B	b3
3	a3	B	b1
4	a3	B	b2
5	a3	B	b3

虚拟变量

pd.get_dummies(data,prefix=None,
prefix_sep=’_’,dummy_na=False,
columns=None, sparse=False,
drop_first=False, dtype=None)

# 因子化
data = ['b','b','a','c','b']
codes, unique = pd.factorize(data)
codes
unique

array(['b', 'a', 'c'], dtype=object)

c = pd.Series([[1,2,3],'foo',[],[3,4]])
c

0    [1, 2, 3]
1          foo
2           []
3       [3, 4]
dtype: object

c.explode()

0      1
0      2
0      3
1    foo
2    NaN
3      3
3      4
dtype: object

c1 = pd.Series({'A':[[1,2,3],'foo',[],[3,4]],'B':range(4)})
c1

A    [[1, 2, 3], foo, [], [3, 4]]
B                    (0, 1, 2, 3)
dtype: object

c1.explode()

A    [1, 2, 3]
A          foo
A           []
A       [3, 4]
B            0
B            1
B            2
B            3
dtype: object

c1.explode('A')

0    [1, 2, 3]
1          foo
2           []
3       [3, 4]
4            0
5            1
6            2
7            3
dtype: object

Pandas 数据清洗

da = pd.DataFrame({
    'A':['a1','a1','a2','a2'],
    'B':['b1','b2',None,'b2'],
    'C':[1,2,3,4],
    'D':[5,6,None,8],
    'E':[5,None,7,8]
}
)
da

	A	B	C	D	E
0	a1	b1	1	5.0	5.0
1	a1	b2	2	6.0	NaN
2	a2	None	3	NaN	7.0
3	a2	b2	4	8.0	8.0

da.isna()

	A	B	C	D	E
0	False	False	False	False	False
1	False	False	False	False	True
2	False	True	False	True	False
3	False	False	False	False	False

da.isnull()

	A	B	C	D	E
0	False	False	False	False	False
1	False	False	False	False	True
2	False	True	False	True	False
3	False	False	False	False	False

da.isna().sum()

A    0
B    1
C    0
D    1
E    1
dtype: int64

"""
1 识别重复值
df.duplicated(subset=None,keep='first')
keep : first 第一次出现值为True
       last  最后一次出现值为True
       FALSE 所有重复值为True
2 删除重复值
df.drop_duplicate(subset=None,
                  keep='first',
                  inplace=False,
                  ignore_index = False)
keep : first 保留第一次出现的重复值
       last  保留最后一次出现重复值
       FALSE 删除所有的重复值
"""

"\n1 识别重复值\ndf.duplicated(subset=None,keep='first')\nkeep : first 第一次出现值为True\n       last  最后一次出现值为True\n       FALSE 所有重复值为True\n2 删除重复值\ndf.drop_duplicate(subset=None,\n                  keep='first',\n                  inplace=False,\n                  ignore_index = False)\nkeep : first 保留第一次出现的重复值\n       last  保留最后一次出现重复值\n       FALSE 删除所有的重复值\n"

s1 = pd.Series(['A','Boy','C',np.nan],dtype="string")
s1

0       A
1     Boy
2       C
3    <NA>
dtype: string

s1.str.lower()

0       a
1     boy
2       c
3    <NA>
dtype: string

df

	name	team	Q1	Q2	Q3	Q4
0	Liver	E	89	21	24	64
1	Arry	C	36	37	37	57
2	Ack	A	57	60	18	84
3	Eorge	C	93	96	71	78
4	Oah	D	65	49	61	86
...	...	...	...	...	...	...
95	Gabriel	C	48	59	87	74
96	Austin7	C	21	31	30	43
97	Lincoln4	C	98	93	1	20
98	Eli	E	11	74	58	91
99	Ben	E	21	43	41	74

100 rows × 6 columns

df.Q1.astype(str).str

<pandas.core.strings.accessor.StringMethods at 0x1fdd73e7588>

s2 = pd.Series(['天_地_人','你_我_他',np.nan, '风_水_火'],dtype='string')
s2

0    天_地_人
1    你_我_他
2     <NA>
3    风_水_火
dtype: string

s2.str.split('_')

0    [天, 地, 人]
1    [你, 我, 他]
2         <NA>
3    [风, 水, 火]
dtype: object

s2.str.split('_').str[1]

0       地
1       我
2    <NA>
3       水
dtype: object

s2.str.split('_',expand=True)

	0	1	2
0	天	地	人
1	你	我	他
2	<NA>	<NA>	<NA>
3	风	水	火


pd.Series(['a','b','c']).repeat(2)

0    a
0    a
1    b
1    b
2    c
2    c
dtype: object

pd.Series(['a','b','c']).str.repeat(2)

0    aa
1    bb
2    cc
dtype: object

pd.Series(['a','b','c']).str.repeat([1,2,3])

0      a
1     bb
2    ccc
dtype: object

s3 = pd.Series(['x','y','z'],dtype='string')
s3

0    x
1    y
2    z
dtype: string

s3.str.cat()

'xyz'

s3.str.cat(sep=',')

'x,y,z'

m0_50889751

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
pandas 相关函数

import pandas as pdimport numpy as npnp.array([1,2,3])array([1, 2, 3])#常见函数np.arange(10)array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])np.linspace(2,3,num=5, endpoint=False)array([2. , 2.2, 2.4, 2.6, 2.8])np.random.randn(4,4)array([[ 0.31090058,
复制链接

扫一扫

	Q1	Q2	Q3
0	89	21	24
1	36	37	37
2	57	60	18
3	93	96	71
4	65	49	61
...	...	...	...
95	48	59	87
96	21	31	30
97	98	93	1
98	11	74	58
99	21	43	41

	Q1	Q2
0	89	21
1	36	37
2	57	60
3	93	96
4	65	49
...	...	...
95	48	59
96	21	31
97	98	93
98	11	74
99	21	43

	name	team	Q1	Q2	Q3	Q4
0	5	1	2	2	2	2
1	4	1	2	2	2	2
2	3	1	2	2	2	2
3	5	1	2	2	2	2
4	3	1	2	2	2	2
...	...	...	...	...	...	...
95	7	1	2	2	2	2
96	7	1	2	2	2	2
97	8	1	2	2	1	2
98	3	1	2	2	2	2
99	3	1	2	2	2	2

	C	c1	c2	c3	c4	c5	c6	All
A	B
a1	b1	1	0	0	0	0	0	1
a1	b2	0	2	0	0	0	0	2
a2	b3	0	0	3	0	0	0	3
a3	b1	0	0	0	4	0	0	4
	b2	0	0	0	0	5	0	5
	b3	0	0	0	0	0	6	6
All		1	2	3	4	5	6	21

	variable	value
0	A	a1
1	A	a1
2	A	a2
3	A	a3
4	A	a3
5	A	a3
6	B	b1
7	B	b2
8	B	b3
9	B	b1
10	B	b2
11	B	b3
12	C	1
13	C	2
14	C	3
15	C	4
16	C	5
17	C	6

	Q1	Q2	Q3
0	89	21	24
1	36	37	37
2	57	60	18
3	93	96	71
4	65	49	61
...	...	...	...
95	48	59	87
96	21	31	30
97	98	93	1
98	11	74	58
99	21	43	41

	Q1	Q2
0	89	21
1	36	37
2	57	60
3	93	96
4	65	49
...	...	...
95	48	59
96	21	31
97	98	93
98	11	74
99	21	43

	name	team	Q1	Q2	Q3	Q4
0	5	1	2	2	2	2
1	4	1	2	2	2	2
2	3	1	2	2	2	2
3	5	1	2	2	2	2
4	3	1	2	2	2	2
...	...	...	...	...	...	...
95	7	1	2	2	2	2
96	7	1	2	2	2	2
97	8	1	2	2	1	2
98	3	1	2	2	2	2
99	3	1	2	2	2	2

	C	c1	c2	c3	c4	c5	c6	All
A	B
a1	b1	1	0	0	0	0	0	1
a1	b2	0	2	0	0	0	0	2
a2	b3	0	0	3	0	0	0	3
a3	b1	0	0	0	4	0	0	4
	b2	0	0	0	0	5	0	5
	b3	0	0	0	0	0	6	6
All		1	2	3	4	5	6	21

	variable	value
0	A	a1
1	A	a1
2	A	a2
3	A	a3
4	A	a3
5	A	a3
6	B	b1
7	B	b2
8	B	b3
9	B	b1
10	B	b2
11	B	b3
12	C	1
13	C	2
14	C	3
15	C	4
16	C	5
17	C	6

pandas 相关函数

数据合并与对比

聚合高级操作

交叉表

虚拟变量

“相关推荐”对你有帮助么？

	Q1	Q2	Q3
0	89	21	24
1	36	37	37
2	57	60	18
3	93	96	71
4	65	49	61
...	...	...	...
95	48	59	87
96	21	31	30
97	98	93	1
98	11	74	58
99	21	43	41

	Q1	Q2
0	89	21
1	36	37
2	57	60
3	93	96
4	65	49
...	...	...
95	48	59
96	21	31
97	98	93
98	11	74
99	21	43

	name	team	Q1	Q2	Q3	Q4
0	5	1	2	2	2	2
1	4	1	2	2	2	2
2	3	1	2	2	2	2
3	5	1	2	2	2	2
4	3	1	2	2	2	2
...	...	...	...	...	...	...
95	7	1	2	2	2	2
96	7	1	2	2	2	2
97	8	1	2	2	1	2
98	3	1	2	2	2	2
99	3	1	2	2	2	2

	C	c1	c2	c3	c4	c5	c6	All
A	B
a1	b1	1	0	0	0	0	0	1
a1	b2	0	2	0	0	0	0	2
a2	b3	0	0	3	0	0	0	3
a3	b1	0	0	0	4	0	0	4
	b2	0	0	0	0	5	0	5
	b3	0	0	0	0	0	6	6
All		1	2	3	4	5	6	21

	variable	value
0	A	a1
1	A	a1
2	A	a2
3	A	a3
4	A	a3
5	A	a3
6	B	b1
7	B	b2
8	B	b3
9	B	b1
10	B	b2
11	B	b3
12	C	1
13	C	2
14	C	3
15	C	4
16	C	5
17	C	6