pandas数据分析泰坦尼克号乘客信息

import pandas as pd

# Load the data first: every statement below needs `food_info` to exist.
# read_csv returns a DataFrame.
food_info = pd.read_csv("food_info.csv")

first_rows = food_info.head(3)  # first 3 rows
print(first_rows)
print(food_info.columns)  # column labels (the header)
print(food_info.shape)  # overall size as (rows, columns)
print(food_info)  # the whole DataFrame

# Row selection by index label:
#   food_info.loc[6]    -> the single row labelled 6
#   food_info.loc[3:6]  -> rows labelled 3 through 6 (label slices are inclusive)
print(food_info.loc[[2, 5, 10]])

# Select specific columns by name
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
print(zinc_copper)

# Keep only the nutrient columns measured in grams (names ending in "(g)")
col_names = food_info.columns.to_list()
print(col_names)
gram_columns = [name for name in col_names if name.endswith("(g)")]
print(gram_columns)
gram_df = food_info[gram_columns]
print(gram_df.head(3))

# Arithmetic on columns is applied element-wise
print(food_info["Iron_(mg)"])
print(food_info["Iron_(mg)"] / 1000)
print(food_info["Iron_(mg)"] * 2)
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
print(water_energy)

# Convert iron from mg to g and store it as a new column
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams
print(food_info["Iron_(g)"])

# Sort by sodium, largest first; sort_values puts NaN entries last by default.
# inplace=True reorders `food_info` itself instead of returning a copy.
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])
# Titanic passenger analysis
import pandas as pd
import numpy as np
titanic = pd.read_csv("titanic_train.csv")
# A bare `titanic.head()` only renders in a notebook; print it so the preview
# is also shown when this runs as a script.
print(titanic.head())

# Count the missing values in the Age column
age = titanic["Age"]
age_is_null = pd.isnull(age)  # boolean mask, True where Age is missing
print(age_is_null)
age_null_true = age[age_is_null]  # only the missing entries
print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)

# Mean age (Series.mean skips NaN by default)
correct_mean_age = titanic["Age"].mean()
print(correct_mean_age)

# Mean fare for each passenger class
passenger_classes = [1, 2, 3]
fares_by_class = {
    this_class: titanic.loc[titanic["Pclass"] == this_class, "Fare"].mean()
    for this_class in passenger_classes
}
print(fares_by_class)

# Survival rate per passenger class.
# The aggregation is named by string: passing numpy callables such as np.mean
# is deprecated in recent pandas and emits a FutureWarning.
passenger_survival = titanic.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)

# Mean age per passenger class
passenger_age = titanic.pivot_table(index="Pclass", values="Age", aggfunc="mean")
print(passenger_age)

# Total fare and number of survivors per port of embarkation
port_stats = titanic.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)

# axis=1 drops every column that contains a missing value
drop_na_columns = titanic.dropna(axis=1)
# axis=0 with subset drops the rows where Age or Sex is missing
new_titanic = titanic.dropna(axis=0, subset=["Age", "Sex"])
print(new_titanic)

# Pick single cells by (row label, column name)
row83 = titanic.loc[83, "Age"]
row1000 = titanic.loc[766, "Pclass"]  # NOTE: despite its name, this reads row label 766
print(row1000)

# Sort by age (oldest first), then rebuild a clean 0..n-1 index;
# drop=True discards the old index instead of keeping it as a column.
new_survived = titanic.sort_values("Age", ascending=False)
print(new_survived[0:10])
titanic_reindexed = new_survived.reset_index(drop=True)
print(titanic_reindexed)
# Custom function: look up one fixed item in a column
def hundredth_row(column):
    """Return the item at integer position 99 (the 100th entry) of ``column``.

    The lookup is positional (``.iloc``), so it does not depend on the
    index labels of ``column``.
    """
    return column.iloc[99]
# Apply the helper to each column of `titanic` and print the collected results.
# NOTE(review): this assignment rebinds the name `hundredth_row` from the
# function to its result, so the function is no longer reachable by that name
# after this line.
hundredth_row=titanic.apply(hundredth_row)
print(hundredth_row)
# Counts how many entries of a column are missing
def not_null_count(column):
    """Return the number of null (missing) entries in ``column``.

    NOTE: despite the name, this counts the entries that ARE null.
    """
    missing_mask = pd.isnull(column)
    return len(column[missing_mask])
# Run the counter on every column (axis=0, the default, hands each column to the function)
column_null_count = titanic.apply(not_null_count, axis=0)
print(column_null_count)
# Discretise the continuous age value into categories
def is_minor(row):
    """Return True when ``row["Age"]`` is below 18, otherwise False.

    A missing age (NaN) compares as not less than 18, so it yields False.
    """
    return bool(row["Age"] < 18)

# Evaluate is_minor once per row; axis="columns" passes each row to the function.
minors = titanic.apply(is_minor, axis="columns")
# (`minors` is not printed here.)

def generate_age_label(row):
    """Classify ``row["Age"]`` as "unknown", "minor" or "adult".

    Returns "unknown" when the age is null, "minor" when it is below 18,
    and "adult" otherwise.
    """
    passenger_age = row["Age"]
    if pd.isnull(passenger_age):
        return "unknown"
    return "minor" if passenger_age < 18 else "adult"

# Label every row; axis="columns" passes each row to the function.
age_labels = titanic.apply(generate_age_label, axis="columns")
print(age_labels)

# Survival rate for each age group: store the labels as a column, then
# average Survived within each label ("mean" is pivot_table's default aggregation).
titanic["age_labels"] = age_labels
age_group_survival = titanic.pivot_table(
    index="age_labels",
    values="Survived",
    aggfunc="mean",
)
print(age_group_survival)
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值