数据分析处理库Pandas

例1: 基本操作

代码一(导入.csv文件):

import pandas as pd  # the snippets use the `pd` alias, so bring it into scope here

# Load the CSV file into a DataFrame.
food_info = pd.read_csv('food_info.csv')
print(type(food_info))    # class of the loaded object
print(food_info.dtypes)   # data type of every column

运行结果:

<class 'pandas.core.frame.DataFrame'>
NDB_No               int64
Shrt_Desc           object
Water_(g)          float64
Energ_Kcal           int64
Protein_(g)        float64
Lipid_Tot_(g)      float64
Ash_(g)            float64
Carbohydrt_(g)     float64
Fiber_TD_(g)       float64
Sugar_Tot_(g)      float64
Calcium_(mg)       float64
Iron_(mg)          float64
Magnesium_(mg)     float64
Phosphorus_(mg)    float64
Potassium_(mg)     float64
Sodium_(mg)        float64
Zinc_(mg)          float64
Copper_(mg)        float64
Manganese_(mg)     float64
Selenium_(mcg)     float64
Vit_C_(mg)         float64
Thiamin_(mg)       float64
Riboflavin_(mg)    float64
Niacin_(mg)        float64
Vit_B6_(mg)        float64
Vit_B12_(mcg)      float64
Vit_A_IU           float64
Vit_A_RAE          float64
Vit_E_(mg)         float64
Vit_D_mcg          float64
Vit_D_IU           float64
Vit_K_(mcg)        float64
FA_Sat_(g)         float64
FA_Mono_(g)        float64
FA_Poly_(g)        float64
Cholestrl_(mg)     float64
dtype: object

代码二(显示.csv文件相关信息):

# head() and tail() only return a DataFrame; they are wrapped in print() so
# the rows are actually shown instead of the result being discarded.
print(food_info.head(10))   # first 10 rows
print(food_info.tail())     # last 5 rows (no count given)
print(food_info.columns)    # column names (attributes)
print(food_info.shape)      # shape of the table: (rows, columns)

代码三(指定样本/行的索引和切片):

# Row selection by index label with .loc.
fourth_sample = food_info.loc[3]          # row labelled 3: the fourth sample when labels start at 0
print(fourth_sample)
fourth_to_seventh = food_info.loc[3:6]    # label slice; both end labels 3 and 6 are included
print(fourth_to_seventh)
chosen_labels = [99, 100, 101]
print(food_info.loc[chosen_labels])       # rows picked by an explicit list of labels

代码四(指定属性/列的索引和切片):

# Column selection by name.
columns = ['Vit_C_(mg)', 'Vit_B6_(mg)']
vit_c = food_info['Vit_C_(mg)']    # a single column name selects that one column
print(vit_c)
# Several columns: the names go together in one list inside the indexing brackets.
print(food_info[['Vit_C_(mg)', 'Vit_B6_(mg)']])
print(food_info[columns])          # same selection, with the list held in a variable

例2: 简单应用-食品信息

代码一(查找单位为(g)的属性并提取):

col_names = food_info.columns.tolist()   # all column names as a plain list
# Keep only the columns whose name ends with the unit suffix '(g)'.
name_g = [name for name in col_names if name.endswith('(g)')]
print(name_g)

运行结果:

['Water_(g)', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)']

代码二(数乘):

iron_mg = food_info['Iron_(mg)']
div_1000 = iron_mg / 1000    # every element of the column is divided by 1000
print(div_1000.head(3))
print(iron_mg.head(3))       # the column in food_info keeps its original values

运行结果:

0    0.00002
1    0.00016
2    0.00000
Name: Iron_(mg), dtype: float64
0    0.02
1    0.16
2    0.00
Name: Iron_(mg), dtype: float64

代码三(属性相乘、添加属性):

a = food_info['Energ_Kcal']
b = food_info['Water_(g)']
c = a * b    # element-wise product of two columns with the same number of samples
# Show the first three values of both inputs and of their product.
for series in (a, b, c):
    print(series.head(3))
print(food_info.shape)
# Assigning to a new key adds the column under that name.
food_info['new'] = c
print(food_info.shape)    # printed again after the assignment for comparison

运行结果:

0    717
1    717
2    876
Name: Energ_Kcal, dtype: int64
0    15.87
1    15.87
2     0.24
Name: Water_(g), dtype: float64
0    11378.79
1    11378.79
2      210.24
dtype: float64
(8618, 36)
(8618, 37)

代码四(最大值、最小值、均值):

# Largest value and mean of the Energ_Kcal column.
energy = food_info['Energ_Kcal']
max_calories = energy.max()
mean_calories = energy.mean()
print(max_calories)
print(mean_calories)

运行结果:

902
226.43861684845672

代码五(排序1):

sort_key = 'Energ_Kcal'
print(food_info[sort_key].head(10))
# Sort by energy, largest first. inplace is left at its default (False), so the
# sorted table is returned as a new DataFrame and food_info is not modified.
a = food_info.sort_values(by=sort_key, ascending=False)
print(a[sort_key].head(10))
print(a.shape)                        # same shape as food_info
print(food_info[sort_key].head(10))   # row order of food_info is unchanged
print(food_info.shape)
# Identity (not equality) check between a column of food_info and the same column of `a`.
print(food_info['Calcium_(mg)'] is a['Calcium_(mg)'])

运行结果:

4282    0
6417    0
292     0
4407    0
4404    0
4208    0
276     0
4392    0
5814    0
4408    0
Name: Energ_Kcal, dtype: int64
610     902
701     902
664     902
702     902
703     902
704     902
705     902
706     902
611     902
8180    900
Name: Energ_Kcal, dtype: int64
(8618, 36)
4282    0
6417    0
292     0
4407    0
4404    0
4208    0
276     0
4392    0
5814    0
4408    0
Name: Energ_Kcal, dtype: int64
(8618, 36)
False

代码六(排序2):

print(food_info['Water_(g)'].head(10))
# inplace=True sorts food_info itself, in descending order (ascending=False).
# With inplace=True sort_values returns None, so `a` is None and the
# assignment makes no difference.
a = food_info.sort_values('Water_(g)',inplace=True,ascending=False)
# a['Water_(g)'].head(10) and a.shape cannot be used here because `a` is None.
print(food_info['Water_(g)'].head(10)) # the row order of food_info itself has changed

运行结果:

4377    100.00
4376    100.00
4378    100.00
4348    100.00
4209    100.00
4372     99.98
4404     99.98
4379     99.97
4373     99.97
4407     99.97
Name: Water_(g), dtype: float64
4377    100.00
4348    100.00
4209    100.00
4376    100.00
4378    100.00
4404     99.98
4372     99.98
4379     99.97
4373     99.97
4407     99.97
Name: Water_(g), dtype: float64

例3: 数据处理实例-泰坦尼克乘客获救

代码一(isnull查找缺失值):

age = titanic_survival['Age']
print(age.loc[0:10])
# Boolean mask: True where the age is missing, False where a value is present.
age_is_null = age.isnull()
print(age_is_null.loc[:10])
# Indexing with the mask keeps only the entries where it is True, i.e. the missing ages.
age_null_ture = age[age_is_null]
# Missing entries whose label is at most 10 -- not the first ten missing entries.
print(age_null_ture.loc[:10])
age_null_count = age_null_ture.size   # number of missing ages
print(age_null_count)

运行结果:

0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
5      NaN
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
Name: Age, dtype: float64
0     False
1     False
2     False
3     False
4     False
5      True
6     False
7     False
8     False
9     False
10    False
Name: Age, dtype: bool
5   NaN
Name: Age, dtype: float64
177

代码二(忽视缺失值):

# Keep only the rows whose age is present: invert the null mask with ~ (rather
# than comparing it to False) and select rows and column in a single .loc call.
good_ages = titanic_survival.loc[~age_is_null, 'Age']
print(good_ages.loc[:10])   # the non-missing ages among labels up to 10
print(good_ages.mean())
print(age.mean())           # mean() skips missing values itself, so it gives the same result

运行结果:

0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
Name: Age, dtype: float64
29.69911764705882
29.69911764705882

代码三(不同舱位等级对应船票均价):

P = [1, 2, 3]    # the passenger classes to summarise
# Map each class to the mean of the Fare column over the rows of that class.
fares_by_P = {
    pclass: titanic_survival.loc[titanic_survival['Pclass'] == pclass, 'Fare'].mean()
    for pclass in P
}
print(fares_by_P)

运行结果:

{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

代码四(pivot_table数据透视表):

# Mean of the Age and Survived columns for each Pclass value.
# The aggregation is given by name ('mean', which is also the default), so the
# snippet does not depend on numpy being imported.
pclass_survived_age = titanic_survival.pivot_table(
    index='Pclass',
    values=['Survived', 'Age'],
    aggfunc='mean',
)
print(pclass_survived_age)

运行结果:

              Age  Survived
Pclass                     
1       38.233441  0.629630
2       29.877630  0.472826
3       25.140620  0.242363

代码五(fillna和dropna):

# Drop every sample (row) that has a missing value.
dropna_axis0 = titanic_survival.dropna(axis='index')
# Drop every attribute (column) that has a missing value.
dropna_axis1 = titanic_survival.dropna(axis='columns')
# Drop only the samples whose Cabin value is missing.
dropna_cabin_axis0 = titanic_survival.dropna(subset=['Cabin'], axis='index')
# Replace missing values with 3333 in a new DataFrame (inplace defaults to False).
# With inplace=True the original would be modified and the call would return None.
a = titanic_survival.fillna(3333)

代码六(sort_values排序,reset_index重置index):

# Sort by age, oldest first.
a = titanic_survival.sort_values(by='Age', ascending=False)
# Renumber the rows of the sorted table. Without drop=True the old index would
# be kept as a column named 'index'; drop=True discards it.
b = a.reset_index(drop=True)

例4: 自定义函数(略)

例5: Series结构(略)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值