例1: 基本操作
代码一(导入.csv文件):
# Read the CSV file into a DataFrame.
food_info = pd.read_csv('food_info.csv')
# Report the type of the loaded object, then the dtype of every column.
print(type(food_info))
print(food_info.dtypes)
运行结果:
<class 'pandas.core.frame.DataFrame'>
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
代码二(显示.csv文件相关信息):
# Preview the first 10 rows. The result is printed explicitly because a bare
# expression is silently discarded when this runs as a script.
print(food_info.head(10))
# Preview the last rows (tail() is called with its default row count).
print(food_info.tail())
# Column labels of the table.
print(food_info.columns)
# (number of rows, number of columns).
print(food_info.shape)
代码三(指定样本/行的索引和切片):
# .loc selects rows by index label.
# A single label yields that one row.
print(food_info.loc[3])
# A label slice includes both endpoints.
print(food_info.loc[3:6])
# A list of labels yields exactly those rows.
row_labels = [99, 100, 101]
print(food_info.loc[row_labels])
代码四(指定属性/列的索引和切片):
# Indexing with one column name selects that single column.
print(food_info['Vit_C_(mg)'])
# Indexing with a list of names selects several columns at once; the list
# can be written inline or stored in a variable first.
columns = ['Vit_C_(mg)', 'Vit_B6_(mg)']
print(food_info[['Vit_C_(mg)', 'Vit_B6_(mg)']])
print(food_info[columns])
例2: 简单应用-食品信息
代码一(查找单位为(g)的属性并提取):
# Collect the names of all columns measured in grams, i.e. every column
# label that ends with '(g)'.
col_names = food_info.columns.tolist()
name_g = [name for name in col_names if name.endswith('(g)')]
print(name_g)
运行结果:
['Water_(g)', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)']
代码二(数乘):
# Dividing a column by a scalar divides every element and produces a new
# Series; the column stored in food_info keeps its original values.
iron_mg = food_info['Iron_(mg)']
div_1000 = iron_mg / 1000
print(div_1000.head(3))
print(food_info['Iron_(mg)'].head(3))
运行结果:
0 0.00002
1 0.00016
2 0.00000
Name: Iron_(mg), dtype: float64
0 0.02
1 0.16
2 0.00
Name: Iron_(mg), dtype: float64
代码三(属性相乘、添加属性):
# Multiplying two columns is element-wise: each row of the result is the
# product of that row's Energ_Kcal and Water_(g) values.
energy = food_info['Energ_Kcal']
water = food_info['Water_(g)']
for column in (energy, water):
    print(column.head(3))
energy_times_water = energy * water
print(energy_times_water.head(3))
# Assigning a Series to a new key adds it as a column, so the column count
# reported by shape grows by one.
print(food_info.shape)
food_info['new'] = energy_times_water
print(food_info.shape)
运行结果:
0 717
1 717
2 876
Name: Energ_Kcal, dtype: int64
0 15.87
1 15.87
2 0.24
Name: Water_(g), dtype: float64
0 11378.79
1 11378.79
2 210.24
dtype: float64
(8618, 36)
(8618, 37)
代码四(最大值、最小值、均值):
# Summary statistics of the calorie column: largest value, then the mean.
energy_kcal = food_info['Energ_Kcal']
max_calories = energy_kcal.max()
mean_calories = energy_kcal.mean()
print(max_calories)
print(mean_calories)
运行结果:
902
226.43861684845672
代码五(排序1):
# With inplace=False, sort_values returns a sorted copy and leaves
# food_info itself untouched.
print(food_info['Energ_Kcal'].head(10))
sorted_by_energy = food_info.sort_values(
    by='Energ_Kcal', ascending=False, inplace=False)
print(sorted_by_energy['Energ_Kcal'].head(10))
print(sorted_by_energy.shape)
# Printing the original again shows the same order and shape as before.
print(food_info['Energ_Kcal'].head(10))
print(food_info.shape)
# Identity check: a column taken from the sorted copy is not the same
# object as the corresponding column of the original frame.
print(food_info['Calcium_(mg)'] is sorted_by_energy['Calcium_(mg)'])
运行结果:
4282 0
6417 0
292 0
4407 0
4404 0
4208 0
276 0
4392 0
5814 0
4408 0
Name: Energ_Kcal, dtype: int64
610 902
701 902
664 902
702 902
703 902
704 902
705 902
706 902
611 902
8180 900
Name: Energ_Kcal, dtype: int64
(8618, 36)
4282 0
6417 0
292 0
4407 0
4404 0
4208 0
276 0
4392 0
5814 0
4408 0
Name: Energ_Kcal, dtype: int64
(8618, 36)
False
代码六(排序2):
print(food_info['Water_(g)'].head(10))
# With inplace=True, sort_values reorders food_info itself and returns None,
# so the call's result is deliberately not assigned to a variable.
food_info.sort_values('Water_(g)', inplace=True, ascending=False)
print(food_info['Water_(g)'].head(10))
运行结果:
4377 100.00
4376 100.00
4378 100.00
4348 100.00
4209 100.00
4372 99.98
4404 99.98
4379 99.97
4373 99.97
4407 99.97
Name: Water_(g), dtype: float64
4377 100.00
4348 100.00
4209 100.00
4376 100.00
4378 100.00
4404 99.98
4372 99.98
4379 99.97
4373 99.97
4407 99.97
Name: Water_(g), dtype: float64
例3: 数据处理实例-泰坦尼克船员获救
代码一(isnull查找缺失值):
# Locate the missing values in the Age column.
age = titanic_survival['Age']
print(age.loc[:10])
# Boolean mask: True wherever the age is missing (NaN).
age_is_null = pd.isnull(age)
print(age_is_null.loc[:10])
# Indexing with the mask keeps only the entries whose age is missing.
age_null_true = age[age_is_null]
print(age_null_true.loc[:10])
# Number of passengers with no recorded age.
age_null_count = len(age_null_true)
print(age_null_count)
运行结果:
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
5 NaN
6 54.0
7 2.0
8 27.0
9 14.0
10 4.0
Name: Age, dtype: float64
0 False
1 False
2 False
3 False
4 False
5 True
6 False
7 False
8 False
9 False
10 False
Name: Age, dtype: bool
5 NaN
Name: Age, dtype: float64
177
代码二(忽视缺失值):
# Keep only the ages that are present by inverting the missing-value mask
# (~mask is the idiomatic negation of a boolean Series).
good_ages = titanic_survival['Age'][~age_is_null]
print(good_ages.loc[:10])
# Series.mean() skips NaN by default, so the mean of the filtered ages and
# the mean of the raw column are the same number.
print(good_ages.mean())
print(age.mean())
运行结果:
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
6 54.0
7 2.0
8 27.0
9 14.0
10 4.0
Name: Age, dtype: float64
29.69911764705882
29.69911764705882
代码三(不同舱位等级对应船票均价):
# Mean ticket fare for each passenger class, stored as {class: mean fare}.
P = [1, 2, 3]
fares_by_P = {}
for pclass in P:
    # Rows belonging to the current passenger class.
    P_class = titanic_survival[titanic_survival['Pclass'] == pclass]
    P_meanfares = P_class['Fare'].mean()
    fares_by_P[pclass] = P_meanfares
print(fares_by_P)
运行结果:
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
代码四(pivot_table数据透视表):
# Pivot table: one row per passenger class, holding the mean of the
# 'Survived' and 'Age' columns for that class. The aggregation is given by
# name ('mean') so the snippet does not depend on numpy being imported and
# avoids the deprecated practice of passing the np.mean callable.
pclass_survived_age = titanic_survival.pivot_table(
    index='Pclass', values=['Survived', 'Age'], aggfunc='mean')
print(pclass_survived_age)
运行结果:
Age Survived
Pclass
1 38.233441 0.629630
2 29.877630 0.472826
3 25.140620 0.242363
代码五(fillna和dropna):
# Drop every row that contains at least one missing value.
dropna_axis0 = titanic_survival.dropna(axis='index')
# Drop every column that contains at least one missing value.
dropna_axis1 = titanic_survival.dropna(axis='columns')
# Drop only the rows whose 'Cabin' value is missing.
dropna_cabin_axis0 = titanic_survival.dropna(axis='index', subset=['Cabin'])
# Replace every missing value with 3333; inplace=False returns a new frame
# and leaves titanic_survival unchanged.
a = titanic_survival.fillna(value=3333, inplace=False)
代码六(sort_values排序,reset_index重置index):
# Sort by age in descending order; each row keeps its original index label.
a = titanic_survival.sort_values(by='Age', ascending=False)
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as an extra column.
b = a.reset_index(drop=True)
例4: 自定义函数(略)
例5: Series结构(略)