Pandas数据预处理与透视表

import pandas as pd
import numpy as np

food_info = pd.read_csv("D:\\test\\titanic_train.csv")  #此处需要转义
food_info.head(2)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th…female38.010PC 1759971.2833C85C
my_age = food_info["Age"]
print (my_age.head(6))
0 22.0 1 38.0 2 26.0 3 35.0 4 35.0 5 NaN Name: Age, dtype: float64
age_isnull = pd.isnull(my_age)  #判断数据是否缺失
print (age_isnull.head(6))
print (my_age[age_isnull].head(3))#把其中为True的数据取出来
print (len(age_isnull))  #查看有多少个数据缺失
0 False 1 False 2 False 3 False 4 False 5 True Name: Age, dtype: bool 5 NaN 17 NaN 19 NaN Name: Age, dtype: float64 891
mean_age = sum(food_info["Age"])/len(food_info["Age"]) 
print (mean_age)  #由于里面存在缺失值,所以无法计算,只能返回nan
my_age = food_info["Age"]
age_isnull = pd.isnull(my_age)  #判断数据是否缺失
good_age = food_info["Age"][age_isnull == False]  #将非缺失的数据取出来
mean_age = sum(good_age)/len(good_age)
print (mean_age)
nan 29.6991176471
food_info["Age"].mean()   #可以自动过滤丢失值
29.69911764705882
#需求:统计不同仓位的平均价格
passenger_class = [1,2,3]
fare_by_class = {}
for this_class in passenger_class:
     rows = food_info[food_info["Pclass"] == this_class] #取出这种仓位的所有行数据
     fare_for_class = rows["Fare"].mean()
     fare_by_class[this_class]=fare_for_class
print (fare_by_class)       
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
passer_survival = food_info.pivot_table(index="Pclass",values="Survived",aggfunc=np.mean)
#数据透视表:index是指标,值是values,意思是统计1等仓的被救的平均值,这个数值是按照values来计算的,方法是按照aggfunc来指定的
print (passer_survival)   
Pclass 1 0.629630 2 0.472826 3 0.242363 Name: Survived, dtype: float64
passer_survival = food_info.pivot_table(index="Pclass",values="Age") #默认就是平均值
#查看仓位与年纪的关系
print (passer_survival)  
Pclass 1 38.233441 2 29.877630 3 25.140620 Name: Age, dtype: float64
passer_survival = food_info.pivot_table(index="Pclass",values=["Age","Fare"],aggfunc=np.sum) #默认就是平均值
#查看仓位与年纪的关系
print (passer_survival) 
Age Fare Pclass 1 7111.42 18177.4125 2 5168.83 3801.8417 3 8924.92 6714.6951
food_info.dropna(axis=0,subset=["Age","Fare"]).head(3)  #按行计算,查看本行中的指定项是否有缺失值,如果有则舍弃这一行
food_info.dropna(axis=1).head(3)#如果某一列存在缺失值则丢失
PassengerIdSurvivedPclassNameSexSibSpParchTicketFare
0103Braund, Mr. Owen Harrismale10A/5 211717.2500
1211Cumings, Mrs. John Bradley (Florence Briggs Th…female10PC 1759971.2833
2313Heikkinen, Miss. Lainafemale00STON/O2. 31012827.9250
age_num = food_info.sort_values("Age",ascending=False).head(3)#按照降序排序
print (age_num)
PassengerId Survived Pclass Name \ 630 631 1 1 Barkworth, Mr. Algernon Henry Wilson 851 852 0 3 Svensson, Mr. Johan 493 494 0 1 Artagaveytia, Mr. Ramon Sex Age SibSp Parch Ticket Fare Cabin Embarked 630 male 80.0 0 0 27042 30.0000 A23 S 851 male 74.0 0 0 347060 7.7750 NaN S 493 male 71.0 0 0 PC 17609 49.5042 NaN C
age_num.reset_index(drop=True)  #将上数结果重新编号
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
063111Barkworth, Mr. Algernon Henry Wilsonmale80.0002704230.0000A23S
185203Svensson, Mr. Johanmale74.0003470607.7750NaNS
249401Artagaveytia, Mr. Ramonmale71.000PC 1760949.5042NaNC
age_num.iloc[1]#取出这一行的数据
PassengerId                    852
Survived                         0
Pclass                           3
Name           Svensson, Mr. Johan
Sex                           male
Age                             74
SibSp                            0
Parch                            0
Ticket                      347060
Fare                         7.775
Cabin                          NaN
Embarked                         S
Name: 851, dtype: object
def get_hundered_info(column):
    return column.iloc[99]

food_info.apply(get_hundered_info) #指向此函数,并且food_info就是入参
PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object
def get_null_num(column):
    null_num=pd.isnull(column)   #返回每一行的每一列的数据是否缺失
    null=column[null_num]        #每行每列缺失的取出来
    return len(null)             #计算本列的缺失数量
food_info.apply(get_null_num)
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
def which_class(row):
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "unknow"
    elif pclass == 1:
        return "First_class"
    elif pclass == 2:
        return "Second_class"   
    elif pclass == 3:
        return "Third_class" 
food_info.apply(which_class,axis=1).head(3)  #将某一列的数据1,2,3替换成字符
0    Third_class
1    First_class
2    Third_class
dtype: object
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值