Pandas快速总结

Series数据结构

# 带标签的一列
import pandas as pd;

a = pd.Series( [1,2,3,4,5]);
a
0    1
1    2
2    3
3    4
4    5
dtype: int64
# 传入index
a = pd.Series([1,2,3,4,5], index=['a', 'b', 'c', 'd', 'e'], dtype=float);
a
a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
dtype: float64
# 从numpy    ndarray导入 
import numpy as np;

a = np.arange(5);
b = pd.Series(a);
print(b)
print(type(a))
0    0
1    1
2    2
3    3
4    4
dtype: int32
<class 'numpy.ndarray'>
# 从dict产生
dic = {'name':'Lee', 'sex':'man', 'age':18}
a = pd.Series(dic)
print(a)
age      18
name    Lee
sex     man
dtype: object
# 手动传入一个索引的时候 以索引为准, 从字典中查找,找不到就直接NaN
my_dict = {'name':'xing', 'sex':'man', 'age':18};
a = pd.Series(my_dict, index = ['name', 'color'])
a
name     xing
color     NaN
dtype: object
# 索引的数量多于 数据内容的时候会自动填充
a = pd.Series(5, [0, 1, 2])
a
0    5
1    5
2    5
dtype: int64
a = pd.Series([3], [0, 1, 2])
a
0    3
1    3
2    3
dtype: int64

DataFrame数据结构

# 从numpy 导入
a = np.random.randint(0, 10, (2,3))
df = pd.DataFrame(a, index=['a', 'b'], columns = ['x', 'y', 'z']);
df
   x  y  z
a  4  3  2
b  5  6  9
# 把Series变为DF
# 一维的字典不能直接转
population = {'beijing':3434, 'shanghai':2343, 'guangzhou':11232};
s = pd.Series(population);
df = pd.DataFrame(s);
df
               0
beijing     3434
guangzhou  11232
shanghai    2343
type(df)
pandas.core.frame.DataFrame
# 还是从series导入,但是加上列名
df = pd.DataFrame(s, columns=['pop_num'])
df
           pop_num
beijing       3434
guangzhou    11232
shanghai      2343
# 字典的字典就可以构建df了
popu = {'bj':9898, 'sh':89887, 'gz':11232}
df = pd.DataFrame({'gdp':popu})  # 字典的字典
df
      gdp
bj   9898
gz  11232
sh  89887
gdp = {'bj':0.998, 'sh':0.889, 'gz':1.232}
df = pd.DataFrame({'gdp':gdp, 'popu':popu})
df
      gdp   popu
bj  0.998   9898
gz  1.232  11232
sh  0.889  89887
# 单一数值(标量)会被自动扩充(广播)到每一行
df = pd.DataFrame({ 'gdp': gdp, 'popu':popu, 'country':'China'})
df
   country    gdp   popu
bj   China  0.998   9898
gz   China  1.232  11232
sh   China  0.889  89887

pandas里面数据的属性

# values属性转为numpy的array数据
df = pd.DataFrame({'gdp':gdp, 'popu':popu});
df
gdppopu
bj0.9989898
gz1.23211232
sh0.88989887
df.values
array([[  9.98000000e-01,   9.89800000e+03],
       [  1.23200000e+00,   1.12320000e+04],
       [  8.89000000e-01,   8.98870000e+04]])
# values属性转为numpy的array数据
df = pd.DataFrame({'gdp':gdp, 'popu':popu, 'country':"China"});
df
countrygdppopu
bjChina0.9989898
gzChina1.23211232
shChina0.88989887
df.values #计算速度更快
array([['China', 0.998, 9898],
       ['China', 1.232, 11232],
       ['China', 0.889, 89887]], dtype=object)
df.index
Index(['bj', 'gz', 'sh'], dtype='object')
df.columns
Index(['country', 'gdp', 'popu'], dtype='object')
df.shape
(3, 3)
df.dtypes
country     object
gdp        float64
popu         int64
dtype: object
df.size
9

索引查找数据

df = pd.DataFrame({'gdp':gdp, 'popu':popu});
df
gdppopu
bj0.9989898
gz1.23211232
sh0.88989887
# 取一列
df['gdp']
bj    0.998
gz    1.232
sh    0.889
Name: gdp, dtype: float64
df.gdp # 对于上面方法的简写
bj    0.998
gz    1.232
sh    0.889
Name: gdp, dtype: float64
# 取一行
df.loc['sh']
gdp         0.889
popu    89887.000
Name: sh, dtype: float64
df.loc[ ['sh', 'bj']] #取多行需要传入列表
gdppopu
sh0.88989887
bj0.9989898
df.loc[ 'bj':'gz'] # 切片可以取到左闭右闭的索引的 一个表格
gdppopu
bj0.9989898
gz1.23211232
# 用位置(整数下标)取数据
df.iloc[ 0]
gdp        0.998
popu    9898.000
Name: bj, dtype: float64
df.iloc[ [0, 2]]
gdppopu
bj0.9989898
sh0.88989887
df.loc['sh', 'gdp'] #精确到一个cell
0.88900000000000001
# iloc 取一个cell
df.iloc[ 0, 1]
9898
# 转为 ndarray 之后再取数
df.values[0][1]
9898.0
# 1: 1到最后  :表示所有的列
df.iloc[ 1:, :]
gdppopu
gz1.23211232
sh0.88989887
df.gdp > 0
bj    True
gz    True
sh    True
Name: gdp, dtype: bool
df.gdp > 0.9
bj     True
gz     True
sh    False
Name: gdp, dtype: bool
df.loc[ df.gdp>0.9] # 用bool变量来筛选
gdppopu
bj0.9989898
gz1.23211232
df[ df.gdp>0.9]
gdppopu
bj0.9989898
gz1.23211232

DF里面的赋值

df.iloc[ 0, 1] = 0 #修改cell
df
      gdp   popu
bj  0.998      0
gz  1.232  11232
sh  0.889  89887
new_column = pd.Series(['010','020','0755'], index=['bj', 'sh','gz']);
new_column
bj     010
sh     020
gz    0755
dtype: object
df['tel'] = new_column #增加一列
df
      gdp   popu   tel
bj  0.998      0   010
gz  1.232  11232  0755
sh  0.889  89887   020

查看数据的基本特征

dates = pd.date_range('2020-1-1', periods=6)
dates
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randint(0, 10, (6,5)), index = dates, columns=list('ABCDE'))
df
            A  B  C  D  E
2020-01-01  4  9  8  8  8
2020-01-02  4  5  5  8  7
2020-01-03  1  0  3  2  0
2020-01-04  9  5  9  6  0
2020-01-05  7  8  3  8  0
2020-01-06  4  3  1  0  8
df.describe() # 对于每一列的描述,看整体结构
              A         B         C         D         E
count  6.000000  6.000000  6.000000  6.000000  6.000000
mean   4.833333  5.000000  4.833333  5.333333  3.833333
std    2.786874  3.286335  3.125167  3.502380  4.215052
min    1.000000  0.000000  1.000000  0.000000  0.000000
25%    4.000000  3.500000  3.000000  3.000000  0.000000
50%    4.000000  5.000000  4.000000  7.000000  3.500000
75%    6.250000  7.250000  7.250000  8.000000  7.750000
max    9.000000  9.000000  9.000000  8.000000  8.000000
df.info() # 描述信息
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2020-01-01 to 2020-01-06
Freq: D
Data columns (total 5 columns):
A    6 non-null int32
B    6 non-null int32
C    6 non-null int32
D    6 non-null int32
E    6 non-null int32
dtypes: int32(5)
memory usage: 168.0 bytes
df.head(1) # 前几行
ABCDE
2020-01-0149888
df.tail(2)
ABCDE
2020-01-0578380
2020-01-0643108
df.T # 转置
   2020-01-01 00:00:00  2020-01-02 00:00:00  2020-01-03 00:00:00  2020-01-04 00:00:00  2020-01-05 00:00:00  2020-01-06 00:00:00
A                    4                    4                    1                    9                    7                    4
B                    9                    5                    0                    5                    8                    3
C                    8                    5                    3                    9                    3                    1
D                    8                    8                    2                    6                    8                    0
E                    8                    7                    0                    0                    0                    8
df.sort_index() #默认 按照行索引升序
ABCDE
2020-01-0149888
2020-01-0245587
2020-01-0310320
2020-01-0495960
2020-01-0578380
2020-01-0643108
df.sort_index(ascending=False) # ascending=False:按照行索引降序排序
ABCDE
2020-01-0643108
2020-01-0578380
2020-01-0495960
2020-01-0310320
2020-01-0245587
2020-01-0149888
df.sort_index(axis=1, ascending=False) # 按照列索引 降序  按照索引排序
EDCBA
2020-01-0188894
2020-01-0278554
2020-01-0302301
2020-01-0406959
2020-01-0508387
2020-01-0680134
# 按照值排序
df.sort_values('B') #默认是按照某一列的值 对各个行排序
ABCDE
2020-01-0310320
2020-01-0643108
2020-01-0245587
2020-01-0495960
2020-01-0578380
2020-01-0149888
df.sort_values(dates[0], axis=1)
ACDEB
2020-01-0148889
2020-01-0245875
2020-01-0313200
2020-01-0499605
2020-01-0573808
2020-01-0641083

数据计算

a = pd.DataFrame([1, 2, 3])
a
0
01
12
23
a-2
0
0-1
10
21
b = pd.DataFrame([1,3,4])
a+b
0
02
15
27
a*b
0
01
16
212
b.T
012
0134
a.dot(b.T) #矩阵乘法
   0  1   2
0  1  3   4
1  2  6   8
2  3  9  12
a = pd.DataFrame(np.random.randint(0, 20, (2,2)), columns=['A', 'B'])
a
    A   B
0  17   1
1   4  11
b = pd.DataFrame(np.random.randint(0, 20, (3,3)), columns = ['A', 'B', 'C'])
b
   A   B   C
0  9   5  17
1  9  12  16
2  0  13   4
a+b # 两边都有的位置就相加,任一边没有的位置就为NaN
      A     B   C
0  26.0   6.0 NaN
1  13.0  23.0 NaN
2   NaN   NaN NaN
a.add(b, fill_value=11111111) # 先填充到 shape相同再计算
            A           B           C
0        26.0         6.0  11111128.0
1        13.0        23.0  11111127.0
2  11111111.0  11111124.0  11111115.0

缺失值的处理

a = pd.DataFrame(np.arange(9).reshape(3,3))
a
012
0012
1345
2678
a.iloc[ :2, 2] = np.NaN
a
   0  1    2
0  0  1  NaN
1  3  4  NaN
2  6  7  8.0
# 丢掉 缺失值
a.dropna()
a
012
001NaN
134NaN
2678.0
a.dropna() #按照行丢弃
012
2678.0
a.dropna(axis=1) # 按照列丢弃
01
001
134
267
a.dropna(axis=1, how='all') # 全部缺失才丢弃
012
001NaN
134NaN
2678.0
a.fillna(999) #帮我们修改
012
001999.0
134999.0
2678.0

合并和对齐

a = pd.DataFrame(np.zeros((3,4)), columns=['a', 'b', 'c', 'd'])
a
abcd
00.00.00.00.0
10.00.00.00.0
20.00.00.00.0
b = pd.DataFrame(np.zeros( (3,4)), columns=list('abcd'))
b
abcd
00.00.00.00.0
10.00.00.00.0
20.00.00.00.0
# 合并,拼接
pd.concat([a, b]) # 需要传递一个列表进去
abcd
00.00.00.00.0
10.00.00.00.0
20.00.00.00.0
00.00.00.00.0
10.00.00.00.0
20.00.00.00.0
# 让pd帮我们重新索引
pd.concat( [a, b], ignore_index=True)
abcd
00.00.00.00.0
10.00.00.00.0
20.00.00.00.0
30.00.00.00.0
40.00.00.00.0
50.00.00.00.0
# 水平合并
pd.concat( [a, b], axis=1)
abcdabcd
00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.0
pd.concat( [a, b], axis=1, ignore_index=True) # ignoreindex 会帮助我们废弃原来的不好用的列名
01234567
00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.0
# shape不同的时候怎么办
a = pd.DataFrame(np.ones( (3,3)), index=[0, 1, 2], columns=list('abc'))
b = pd.DataFrame(np.ones( (3,3)), index=[2, 3, 4], columns = list('cde'))
pd.concat([a, b])
     a    b    c    d    e
0  1.0  1.0  1.0  NaN  NaN
1  1.0  1.0  1.0  NaN  NaN
2  1.0  1.0  1.0  NaN  NaN
2  NaN  NaN  1.0  1.0  1.0
3  NaN  NaN  1.0  1.0  1.0
4  NaN  NaN  1.0  1.0  1.0
pd.concat( [a, b], axis=1)
     a    b    c    c    d    e
0  1.0  1.0  1.0  NaN  NaN  NaN
1  1.0  1.0  1.0  NaN  NaN  NaN
2  1.0  1.0  1.0  1.0  1.0  1.0
3  NaN  NaN  NaN  1.0  1.0  1.0
4  NaN  NaN  NaN  1.0  1.0  1.0
# 增加一行的办法
a = pd.DataFrame( np.ones( (3,4)), index=[0, 1, 2], columns=['a', 'b','c','d'])
a
abcd
01.01.01.01.0
11.01.01.01.0
21.01.01.01.0
b = pd.Series([100, 100, 100, 100], index=list('abcd'))
b
a    100
b    100
c    100
d    100
dtype: int64
a.append(b, ignore_index=True) # 添加一行(注意:DataFrame.append 已在 pandas 2.0 中移除,新版本可改用 pd.concat([a, b.to_frame().T], ignore_index=True))
       a      b      c      d
0    1.0    1.0    1.0    1.0
1    1.0    1.0    1.0    1.0
2    1.0    1.0    1.0    1.0
3  100.0  100.0  100.0  100.0
# 对齐:用merge方法,会根据两个表中相同的‘列’(这里是 B)来匹配行,必要时调整行的上下顺序,保证该列的数值一一对应
a = pd.DataFrame([[-1, 1], 
                 [-2,  0]], index=[1, 2], columns=["A", "B"]);
b = pd.DataFrame([[1, 11], 
                 [0,   10]], index=[1, 2], columns= ['B', 'C']);
print(a)
print(b)
   A  B
1 -1  1
2 -2  0
   B   C
1  1  11
2  0  10
pd.merge( a,b)
   A  B   C
0 -1  1  11
1 -2  0  10
b = pd.DataFrame([[0, 20], 
                  [1, 21]], index=[1,2], columns = ['B', 'C']);
b
BC
1020
2121
a
AB
1-11
2-20
pd.merge(a, b)
   A  B   C
0 -1  1  21
1 -2  0  20

分组

df = pd.DataFrame({
    'key':list('ABCCBA'),
    'data1':range(6), # range是python自带的 np.arange 是numpy里面的
    'data2':range(20,26)
})
df
   data1  data2 key
0      0     20   A
1      1     21   B
2      2     22   C
3      3     23   C
4      4     24   B
5      5     25   A
groups = df.groupby('key')
groups
<pandas.core.groupby.DataFrameGroupBy object at 0x000002A97C0EACC0>
groups.sum() # 每一组的sum
     data1  data2
key
A        5     45
B        5     45
C        5     45
groups.data1.sum()  # 针对其中一列 sum
key
A    5
B    5
C    5
Name: data1, dtype: int32
groups.median()
     data1  data2
key
A      2.5   22.5
B      2.5   22.5
C      2.5   22.5
groups['data1'].mean()#Series类型的返回值
key
A    2.5
B    2.5
C    2.5
Name: data1, dtype: float64
groups.apply(lambda x:x['data1']/x['data1'].sum())
key   
A    0    0.0
     5    1.0
B    1    0.2
     4    0.8
C    2    0.4
     3    0.6
Name: data1, dtype: float64
def func(x):
    """Return a copy of ``x`` whose ``data1`` column is rescaled to sum to 1.

    Intended as the callable passed to ``groupby(...).apply``: each value in
    ``data1`` is divided by the total of ``data1`` within ``x``.

    Args:
        x: A DataFrame (one group) containing a ``data1`` column.

    Returns:
        A new DataFrame with the same columns as ``x``; only ``data1`` differs.
    """
    # Work on a copy: pandas does not support UDFs that mutate the object
    # handed to apply(), and mutating it also leaks changes into the caller.
    normalized = x.copy()
    normalized['data1'] = normalized['data1'] / normalized['data1'].sum()
    return normalized

groups.apply(func)
   data1  data2
0    0.0     20
1    0.2     21
2    0.4     22
3    0.6     23
4    0.8     24
5    1.0     25
def func(x):
    """Return a copy of ``x`` whose ``data1`` column is rescaled to sum to 1.

    Intended as the callable passed to ``groupby(...).apply``: each value in
    ``data1`` is divided by the total of ``data1`` within ``x``.

    Args:
        x: A DataFrame (one group) containing a ``data1`` column.

    Returns:
        A new DataFrame with the same columns as ``x``; only ``data1`` differs.
    """
    # Work on a copy: pandas does not support UDFs that mutate the object
    # handed to apply(), and mutating it also leaks changes into the caller.
    normalized = x.copy()
    normalized['data1'] = normalized['data1'] / normalized['data1'].sum()
    return normalized

df.groupby('key').apply(func)  #归一化,用这个吧,比较简单
   data1  data2 key
0    0.0     20   A
1    0.2     21   B
2    0.4     22   C
3    0.6     23   C
4    0.8     24   B
5    1.0     25   A

数据透视表

import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
   survived  pclass     sex   age  sibsp  parch     fare embarked  class    who  adult_male deck  embark_town alive  alone
0         0       3    male  22.0      1      0   7.2500        S  Third    man        True  NaN  Southampton    no  False
1         1       1  female  38.0      1      0  71.2833        C  First  woman       False    C    Cherbourg   yes  False
2         1       3  female  26.0      0      0   7.9250        S  Third  woman       False  NaN  Southampton   yes   True
3         1       1  female  35.0      1      0  53.1000        S  First  woman       False    C  Southampton   yes  False
4         0       3    male  35.0      0      0   8.0500        S  Third    man        True  NaN  Southampton    no   True
titanic.pivot_table('survived', index='sex', columns='class')#透视表
class      First    Second     Third
sex
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值