pandas进阶03--高级应用篇

pandas进阶03–高级应用篇

接上一篇:https://blog.csdn.net/sinat_30353259/article/details/80776905
目录介绍:
06、pandas高级应用–数据分割
07、pandas高级应用–数据的过滤和筛选
08、pandas高级应用–读写文件数据
09、pandas高级应用–数据的聚合及分组计算
10、pandas高级应用–分组计算apply

文中用到的数据:链接:https://pan.baidu.com/s/1TEy1iHOlVt_7WYFWm717Sw 密码:mvlt

06、pandas高级应用–数据分割

from pandas import DataFrame,Series
import pandas as pd
import numpy as np


data = np.random.randn(1000)   # 服从正态分布
result = pd.qcut(data,4)       # qcut按分位数将data分为4组,每组元素个数相同
result

# 统计落在每个区间的元素个数
pd.value_counts(result)
打印结果:
(0.683, 3.121]      250
(0.0452, 0.683]     250
(-0.614, 0.0452]    250
(-3.65, -0.614]     250
dtype: int64


# cut函数分割一组数据  cut计算:找出data中的最大值最小值,之差除以4,得出区间值,然后以这个区间值分四份
data = np.random.randn(20)
# 用cut函数将一组数据分割成n份  precision:保留小数点的有效位数
result = pd.cut(data,4,precision=2)
pd.value_counts(result)
打印结果:
(0.99, 2.11]      9
(-1.25, -0.13]    7
(-0.13, 0.99]     3
(-2.38, -1.25]    1
dtype: int64


data = np.random.randn(1000)
# 根据分位数差的百分比分割
result=pd.qcut(data,[0,0.1,0.5,0.9,1.0])
pd.value_counts(result)
打印结果:
(-0.0402, 1.185]                 400
(-1.352, -0.0402]                400
(1.185, 2.873]                   100
(-2.7689999999999997, -1.352]    100
dtype: int64


# 如果分位数列表的最大值小于1.0,则只分割到该分位数为止;各区间的元素个数按相邻分位数之差计算
result = pd.qcut(data,[0,0.1,0.3,0.75])
pd.value_counts(result)
打印结果:
(-0.562, 0.646]                  450
(-1.352, -0.562]                 200
(-2.7689999999999997, -1.352]    100
dtype: int64

07、pandas高级应用–数据的过滤和筛选

from pandas import DataFrame,Series
import pandas as pd
import numpy as np


data = np.random.randn(1000,4)
df = DataFrame(data)
df

#np.random.seed(num)  num是生成随机数的种子
#np.random.randn()    #默认生成随机数的种子数是当前时间的时间戳
#定义一个种子数
np.random.seed(33)
np.random.rand()
打印结果:
0.24851012743772993

np.random.seed(12345)
data = DataFrame(np.random.randn(1000,4))
data

# 简单的数据统计信息
data.describe()
打印结果:
            0           1           2           3
count   1000.000000 1000.000000 1000.000000 1000.000000
mean    -0.067684   0.067924    0.025598    -0.002298
std 0.998035    0.992106    1.006835    0.996794
min -3.428254   -3.548824   -3.184377   -3.745356
25% -0.774890   -0.591841   -0.641675   -0.644144
50% -0.116401   0.101143    0.002073    -0.013611
75% 0.616366    0.780282    0.680391    0.654328
max 3.366626    2.653656    3.260383    3.927528


# 获取数据的第四列
col = data[3]
# 筛选出绝对值大于3的数
col[np.abs(col)>3]
打印结果:
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64


# 查找出数据集中有任意某一列的值出现大于3的行数据
data[(np.abs(data)>3).any(1)]
打印结果:
        0           1           2           3
5   -0.539741   0.476985    3.248944    -1.021228
97  -0.774363   0.552936    0.106061    3.927528
102 -0.655054   -0.565230   3.176873    0.959533
305 -2.315555   0.457246    -0.025907   -3.399312
324 0.050188    1.951312    3.260383    0.963301
400 0.146326    0.508391    -0.196713   -3.745356
499 -0.293333   -0.242459   -3.056990   1.918403
523 -3.428254   -0.296336   -0.439938   -0.867165
586 0.275144    1.179227    -3.184377   1.369891
808 -0.362528   -3.548824   1.553205    -2.186301
900 3.366626    -2.372214   0.851010    1.332846


# 将所有绝对值大于3的正数设置为3,绝对值大于3的负数设置为-3
# 获取数据的符号
data[np.abs(data)>3] = np.sign(data)*3
data.describe()
打印结果:
            0           1               2           3
count   1000.000000 1000.000000 1000.000000 1000.000000
mean    -0.067623   0.068473    0.025153    -0.002081
std 0.995485    0.990253    1.003977    0.989736
min -3.000000   -3.000000   -3.000000   -3.000000
25% -0.774890   -0.591841   -0.641675   -0.644144
50% -0.116401   0.101143    0.002073    -0.013611
75% 0.616366    0.780282    0.680391    0.654328
max 3.000000    2.653656    3.000000    3.000000

08、pandas高级应用–读写文件数据

from pandas import DataFrame,Series
import pandas as pd
import numpy as np
import sys


# pandas读取csv文件
data = pd.read_csv('data/ex1.csv')
data
打印结果:
    a   b   c   d   message
0   1   2   3   4   hello
1   5   6   7   8   world
2   9   10  11  12  foo


# 用read_table 读取csv文件   sep:分割符为逗号
pd.read_table('data/ex1.csv',sep=',')
打印结果:
    a   b   c   d   message
0   1   2   3   4   hello
1   5   6   7   8   world
2   9   10  11  12  foo


# 读取的csv文件没有标题,默认分配表头索引
pd.read_csv('data/ex2.csv',header=None)
打印结果:
    0   1   2   3   4
0   1   2   3   4   hello
1   5   6   7   8   world
2   9   10  11  12  foo


# 自定义添加标题
pd.read_csv('data/ex2.csv',names=['a','b','c','e','name'])
打印结果:
    a   b   c   e   name
0   1   2   3   4   hello
1   5   6   7   8   world
2   9   10  11  12  foo


# 指定行索引
pd.read_csv('data/ex2.csv',names=['a','b','c','e','name'],index_col='name')
打印结果:
        a   b   c   e
name                
hello   1   2   3   4
world   5   6   7   8
foo 9   10  11  12


# 将读取的数据进行层次化索引
pd.read_csv('data/csv_mindex.csv',index_col=['key1','key2'])
打印结果:
    value1  value2
key1    key2        
one a   1   2
b   3   4
c   5   6
d   7   8
two a   9   10
b   11  12
c   13  14
d   15  16


# 读取文本文件
list(open('data/ex3.txt'))
打印结果:
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']


pd.read_table('data/ex3.txt',sep='\s+')   # \s+ 匹配一个或多个空白字符
 打印结果:

        A           B           C
aaa -0.264438   -1.026059   -0.619500
bbb 0.927272    0.302904    -0.032399
ccc -0.264273   -0.386314   -0.217601
ddd -0.871858   -0.348382   1.100491


# 通过skiprows参数指定需要跳过的行索引
pd.read_csv('data/ex4.csv',skiprows=[0,2,3])
打印结果:
    a   b   c   d   message
0   1   2   3   4   hello
1   5   6   7   8   world
2   9   10  11  12  foo


# 加载存在NaN值缺值数据,把na_values对应的值替换为NaN
pd.read_csv('data/ex5.csv',na_values=3)
打印结果:
something   a   b   c   d   message
0   one 1   2   NaN 4   NaN
1   two 5   6   NaN 8   world
2   three   9   10  11.0    12  foo


# 按列指定哪些值应被视为NaN
sentlines = {'message':['foo','NA'],
            'something':['two']}
pd.read_csv('data/ex5.csv',na_values=sentlines)
打印结果:
something   a   b   c   d   message
0   one 1   2   3.0 4   NaN
1   NaN 5   6   NaN 8   world
2   three   9   10  11.0    12  NaN


#  用pandas将数据写入到csv文件中
df = DataFrame(np.random.randn(4,3),columns=['a','b','c'],index=['one','two','three','four'])
df
打印结果:
        a           b           c
one -0.168265   -0.291455   -0.494909
two 1.358356    -1.056611   -0.672313
three   0.272671    -1.088189   0.365312
four    0.044457    -0.495820   0.737573


# 写入文件
df.to_csv('data/mydata.csv')


data = pd.read_csv('data/ex5.csv')
data
打印结果:
    something   a   b   c   d   message
0   one 1   2   3.0 4   NaN
1   two 5   6   NaN 8   world
2   three   9   10  11.0    12  foo


# 把数据集中的NAN值替换
data.to_csv(sys.stdout,na_rep=['NULL'])
打印结果:
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


# 只写入数据,不写入行和列的索引
data.to_csv(sys.stdout,index=False,header=False)
data.to_csv('data/aa.csv',index=False,header=False)
打印结果:
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


# 通过columns来指定写入文件中的列
data.to_csv(sys.stdout,index=False,columns=['a','c','message'])
打印结果:
a,c,message
1,3.0,
5,,world
9,11.0,foo

09、pandas高级应用–数据的聚合及分组计算

from pandas import DataFrame,Series
import pandas as pd
import numpy as np


df = DataFrame({"key":['a','a','b','b','a'],
                 "key2":['one','two','one','two','one'],
                 "data1":np.random.randn(5),
                 "data2":np.random.randn(5)})
df
打印结果:
        data1   data2       key key2
0   2.416179    1.114372    a   one
1   0.020374    -0.631952   a   two
2   0.840316    0.076307    b   one
3   0.079715    0.600914    b   two
4   0.284892    -1.005433   a   one

# 选取data1数据列按照key1进行分组
groupd = df["data1"].groupby(df['key'])
# 调用已经分组好的数据中一些方法,即可得出相应的统计结果
# 获取分组中每一组数据的平均值
groupd.mean()
打印结果:
key
a    0.907148
b    0.460016
Name: data1, dtype: float64

# 根据key和key2两个列数据进行分组
grouped = df['data1'].groupby([df['key'],df['key2']])
# 求平均
grouped.mean()
打印结果:
key  key2
a    one     1.350536
     two     0.020374
b    one     0.840316
     two     0.079715
Name: data1, dtype: float64


# 获取分组后每一组中相应元素出现的次数
df.groupby([df['key'],df['key2']]).size()    #或 df.groupby(['key','key2']).size()
key  key2
a    one     2
     two     1
b    one     1
     two     1
dtype: int64
打印结果:


# for  in  循环输出分组的结果
for name,group in df.groupby("key"):
    print(name)
    print(group)
打印结果:
a
      data1     data2 key key2
0  2.416179  1.114372   a  one
1  0.020374 -0.631952   a  two
4  0.284892 -1.005433   a  one
b
      data1     data2 key key2
2  0.840316  0.076307   b  one
3  0.079715  0.600914   b  two

# 将groupby 分类结果转化成字典
pices = dict(list(df.groupby('key')))
pices['b']
打印结果:
    data1       data2       key key2
2   0.840316    0.076307    b   one
3   0.079715    0.600914    b   two


# 按照列的数据类型分组
group = df.groupby(df.dtypes,axis=1)
dict(list(group))
打印结果:
{dtype('float64'):       data1     data2
 0  2.416179  1.114372
 1  0.020374 -0.631952
 2  0.840316  0.076307
 3  0.079715  0.600914
 4  0.284892 -1.005433, dtype('O'):   key key2
 0   a  one
 1   a  two
 2   b  one
 3   b  two
 4   a  one}

 group.size()
 打印结果:
 float64    2
object     2
dtype: int64

# 选择分类数据中的一个或一组
# 方法一:  生成的数据类型是DataFrame
df.groupby(['key','key2'])[['data2']].mean()
打印结果:
key  key2
a    one     0.054469
     two    -0.631952
b    one     0.076307
     two     0.600914
Name: data2, dtype: float64

# 方法二:  生成的类型是series
df['data2'].groupby([df['key'],df['key2']]).mean()

# 方法三:
df.groupby(['key','key2'])['data2'].mean()


# 通过字典或series进行分组
people = DataFrame(np.random.randn(5,5),columns=list('abcde'),index=['Joe','Stenve','Wes','Jim','Travis'])
people
打印结果:
        a           b           c           d           e
Joe 1.440881    -0.673088   1.242920    -1.200689   -2.217625
Stenve  1.264059    0.146833    1.682436    0.026026    -0.271379
Wes 0.486420    -1.100767   0.574013    -0.559674   -0.569250
Jim 0.036516    -1.797862   -1.673278   1.477805    0.187126
Travis  -0.258384   0.542421    -1.283412   0.258634    -0.193361


# 创建一个将列名进行映射的字典
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_columns = people.groupby(mapping,axis=1)
by_columns.sum()
打印结果:
        blue        red
Joe 0.042231    -1.449832
Stenve  1.708462    1.139513
Wes 0.014339    -1.183597
Jim -0.195473   -1.574220
Travis  -1.024778   0.090675

# 将mapping转化成Series
seriesMap = Series(mapping)
seriesMap
打印结果:
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object


people.groupby(seriesMap,axis=1).count()
打印结果:
    blue    red
Joe 2   3
Stenve  2   3
Wes 2   3
Jim 2   3
Travis  2   3



# 分组后常用的计算方法
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'], 
            'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1,10, 8), 
            'data2': np.random.randint(1,10, 8)}
df = DataFrame(dict_obj)
# 按照key1进行分组求和
df.groupby('key1').sum()
打印结果:
    data1   data2
key1        
a   17  22
b   10  22


df[['data1']].groupby(df['key1']).sum()
打印结果:
    data1
key1    
a   17
b   10


# 求最大值
df.groupby('key1').max()
打印结果:
data1   data2   key2
key1            
a   7   6   two
b   7   9   two


df.groupby('key1').describe()
打印结果:
data1                                   data2
count   mean    std min 25% 50% 75% max count   mean    std min 25% 50% 75% max
key1                                                                
a   5.0 3.400000    2.880972    1.0 1.0 2.0 6.0 7.0 5.0 4.400000    1.816590    2.0 3.0 5.0 6.0 6.0
b   3.0 3.333333    3.214550    1.0 1.5 2.0 4.5 7.0 3.0 7.333333    2.886751    4.0 6.5 9.0 9.0 9.0


# 第一种方法
# 定义一个函数,求获得DataFrame中某一列数据的最大值和最小值之差
def peak_range(df):
    """Return the spread (max - min) of *df*, column-wise for a DataFrame.

    When used with ``groupby(...).agg``, pandas calls this once per column,
    passing each column of the group as a Series.
    """
    # Echo the argument type to demonstrate what groupby.agg passes in.
    print(type(df))
    spread = df.max() - df.min()
    return spread

df.groupby('key1').agg(peak_range)
打印结果:
    data1   data2
key1        
a   6   4
b   6   5

# 用匿名函数的方式
#  第二种方法
df.groupby('key1').agg(lambda df: df.max()-df.min())


# 使用agg对分组后的数据调用多个聚合函数
df.groupby('key1').agg(['mean','sum','count','min','std',peak_range])     # 带引号的参数是内置聚合函数
打印结果:
data1                                   data2
mean    sum count   min std peak_range  mean    sum count   min std peak_range
key1                                                
a   3.400000    17  5   1   2.880972    6   4.400000    22  5   2   1.816590    4
b   3.333333    10  3   1   3.214550    6   7.333333    22  3   4   2.886751    5

df.groupby('key1').agg(['mean','sum','count','min','std',('ptp',peak_range)])  #peak_range 给peak_range起一个别名,叫ptp


# 使用字典实现数据集的每列作用不同的聚合函数
dict_mapping = {'data1':'mean',
               'data2':'sum'}
df.groupby('key1').agg(dict_mapping)
打印结果:
    data1   data2
key1        
a   3.400000    22
b   3.333333    22


# 以"没有索引形式"返回聚合函数的计算结果
df = pd.DataFrame(data={'books':['bk1','bk1','bk1','bk2','bk2','bk3'], 'price': [12,12,12,15,15,17]}) 
# 将分组之后的books列作为索引
df.groupby("books",as_index=True).sum()
打印结果:
    price
books   
bk1 36
bk2 30
bk3 17


# 不让分组之后的books列作为索引
df.groupby("books",as_index=False).sum()
打印结果:
books   price
0   bk1 36
1   bk2 30
2   bk3 17







10、pandas高级应用–分组计算apply

from pandas import DataFrame,Series
import pandas as pd
import numpy as np

# 加载数据
tips = pd.read_csv('data/tips.csv')
tips

# 为tips数据集添加新的一列,作为客户给的小费占消费总额的百分比
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips

# 定义函数,筛选出小费占比最大的前五条数据
def top(df,n=5,columns='tip_pct'):
    """Return the *n* rows of *df* with the largest values in *columns*.

    The result is ordered ascending by *columns* (largest row last), matching
    the tail of the fully sorted frame.
    """
    ranked = df.sort_values(by=columns)
    return ranked.iloc[-n:]
top(tips,n=6)
打印结果:
    total_bill  tip smoker  day time    size    tip_pct
109 14.31   4.00    Yes Sat Dinner  2   0.279525
183 23.17   6.50    Yes Sun Dinner  4   0.280535
232 11.61   3.39    No  Sat Dinner  2   0.291990
67  3.07    1.00    Yes Sat Dinner  1   0.325733
178 9.60    4.00    Yes Sun Dinner  2   0.416667
172 7.25    5.15    Yes Sun Dinner  2   0.710345

#对数据集按抽烟进行分组
group = tips.groupby('smoker')
group
# 使用apply方法分别求出抽烟和不抽烟的客户中的消费占比排在前五客户
group.apply(top)
打印结果:
    total_bill  tip smoker  day time    size    tip_pct
smoker                              
No  88  24.71   5.85    No  Thur    Lunch   2   0.236746
185 20.69   5.00    No  Sun Dinner  5   0.241663
51  10.29   2.60    No  Sun Dinner  2   0.252672
149 7.51    2.00    No  Thur    Lunch   2   0.266312
232 11.61   3.39    No  Sat Dinner  2   0.291990
Yes 109 14.31   4.00    Yes Sat Dinner  2   0.279525
183 23.17   6.50    Yes Sun Dinner  4   0.280535
67  3.07    1.00    Yes Sat Dinner  1   0.325733
178 9.60    4.00    Yes Sun Dinner  2   0.416667
172 7.25    5.15    Yes Sun Dinner  2   0.710345


# 显示抽烟和不抽烟客户的数量
group.size()
打印结果:
smoker
No     151
Yes     93
dtype: int64

group.count()
打印结果:
    total_bill  tip day time    size    tip_pct
smoker                      
No  151 151 151 151 151 151
Yes 93  93  93  93  93  93

# 为apply中使用的函数传参
tips.groupby(['smoker','day'],group_keys=False).apply(top,columns='total_bill',n=2)
frame = pd.DataFrame({'data1': np.random.randn(1000),
                    'data2': np.random.randn(1000)})
result = pd.cut(frame.data1,4)
pd.value_counts(result)
打印结果:
(-1.473, 0.292]     547
(0.292, 2.056]      359
(-3.245, -1.473]     74
(2.056, 3.821]       20
Name: data1, dtype: int64

result[:10]  #取前10行

# 定义一个函数,求data1数据列各项参数计算
def get_stats(group):
    """Summarize *group* (a Series) as a dict of basic statistics.

    Returns a mapping with keys 'min', 'max', 'count' and 'mean'; when used
    with ``groupby(...).apply`` the dicts stack into a Series that can be
    reshaped with ``unstack()``.
    """
    stats = {}
    stats['min'] = group.min()
    stats['max'] = group.max()
    stats['count'] = group.count()
    stats['mean'] = group.mean()
    return stats
grouped = frame.data2.groupby(result)
grouped.apply(get_stats).unstack()
打印结果:
                count       max         mean        min
data1               
(-3.245, -1.473]    74.0    2.331778    -0.002736   -2.275670
(-1.473, 0.292] 547.0   3.502541    0.040977    -2.975814
(0.292, 2.056]  359.0   3.074152    -0.030946   -2.596042
(2.056, 3.821]  20.0    1.936424    0.292270    -0.980695


# 加权平均和相关系数
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b','b'],
                   'data': np.random.randn(8), 'weights': np.random.rand(8)})
grouped = df.groupby('category')
grouped.size()
打印结果:
category
a    4
b    4
dtype: int64

# 求加权平均数的函数
def get_wavg(g):
    """Weighted average of ``g.data`` using ``g.weights`` as the weights."""
    return np.average(g.data, weights=g.weights)
grouped.apply(get_wavg)
打印结果:
category
a    0.360001
b   -0.349372
dtype: float64

没有更多推荐了,返回首页