import pandas as pd
# Load the food nutrition table; read_csv returns a pandas DataFrame.
food_info = pd.read_csv("C:/Users/de/Desktop/唐宇迪数据集/food_info.csv")
print(type(food_info))  # a DataFrame (the NumPy container seen earlier was ndarray)
# Common pandas dtypes: int, float, object (used for strings), datetime, bool.
print(food_info.dtypes)
# help() writes the documentation itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None").
help(pd.read_csv)
print(food_info.head())  # first five rows
print(food_info.head(3))  # first three rows
print(food_info.tail())  # last rows
print(food_info.columns)  # column labels
print(food_info.shape)  # (number of rows, number of columns)
print(food_info.loc[0])  # row with index label 0
# .loc slices by label and includes both endpoints: rows 3, 4 and 5.
print(food_info.loc[3:5])
numbers = [2, 5, 10]
print(food_info.loc[numbers])  # several rows picked by a list of labels
# Select a single column by name.
ndb_col = food_info["NDB_No"]
print(ndb_col)
# Alternatively, pass a list of column names. This is also how several columns
# are selected at once: put the names in a list and hand it to the DataFrame.
# (colname = "NDB_No" works as well.)
colname = ["NDB_No"]
print(food_info[colname])
# Some columns are measured in g and others in mg; pick out the gram columns
# by their "(g)" name suffix.
columns = food_info.columns.tolist()
print(columns)
gram_columns = [c for c in columns if c.endswith("(g)")]
print(food_info[gram_columns].head(3))
# Convert a column from mg to g: arithmetic on a column is applied to every
# element.
iron_mg = food_info["Iron_(mg)"]
col = iron_mg / 1000
print(iron_mg)
print('****')
print(col)

# Two columns can be combined element by element in the same way.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]

# Assigning to a new column label appends a column; the shape printed before
# and after shows the column count change.
iron_grams = iron_mg / 1000
print(food_info.shape)
food_info["new_col"] = iron_grams
print(food_info.shape)
# Column reductions such as min / max / mean work on a single column.
energy_kcal = food_info['Energ_Kcal']
max_calories = energy_kcal.max()
print(max_calories)
print(food_info.shape)

# Sort the rows by a column. The default order is ascending with missing
# values placed last; pass ascending=False for descending order.
# inplace=True sorts food_info itself instead of returning a new DataFrame.
food_info.sort_values(by="Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
print(food_info.shape)
# Load the Titanic passenger table.
titanic_survival = pd.read_csv(
    "C:/Users/de/Desktop/唐宇迪数据集/泰坦尼克号数据.csv"
)
print(titanic_survival.head(n=3))
# Column notes from the original author: Survived = outcome label,
# Pclass = ticket class, SibSp = number of siblings, Parch = number of family
# members, Fare = ticket price, Embarked = boarding port.
age = titanic_survival["Age"]
print(age.loc[0:10])  # label slice, both endpoints included
# Handling missing values.
age = titanic_survival["Age"]
age_isnull = pd.isnull(age)
print(age_isnull)  # the True/False values can be used as a row selector

# Rows whose age is missing, and how many there are.
age_null_true = titanic_survival[age_isnull]
print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)

# Averaging by hand before the missing values are dealt with: the original
# author notes that this gives an unusable result.
mean_age = sum(age) / len(age)
print(mean_age)

# Keep only the rows where the age is present, then average again.
good_ages = age[~pd.isnull(age)]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# The same figure is available directly from pandas' own mean().
correct_mean_age = age.mean()
print(correct_mean_age)
# Average fare for each passenger class, computed with an explicit loop.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Rows belonging to the current class only.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    fares_by_class[this_class] = pclass_rows["Fare"].mean()
print(fares_by_class)
# Survival rate per passenger class. pivot_table does the grouping that
# would otherwise need a loop.
import numpy as np  # kept: later parts of this script reference np

# Aggregations are named with strings; recent pandas versions warn when NumPy
# callables such as np.mean / np.sum are passed as aggfunc.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)

# Average age per passenger class (aggfunc defaults to the mean).
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)

# Total fare and total number of survivors per boarding port.
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)
# Drop every row whose Age or Sex is missing; the shapes show how many rows
# were removed.
print(titanic_survival.shape)
required_columns = ["Age", "Sex"]
new_data = titanic_survival.dropna(subset=required_columns, axis=0)
print(new_data.shape)

# Address one specific cell: row label 83, column "Age".
row_index_83_age = titanic_survival.loc[83, "Age"]
print(row_index_83_age)

# Sort by age, oldest first, and show the first ten rows.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival[:10])

# After sorting, the index labels are out of order; rebuild them as 0..n-1
# and discard the old labels.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed)
# Custom function: fetch the value of the hundredth row.
def hundredth_row(column):
    """Return the entry of *column* stored under index label 99.

    Intended for ``DataFrame.apply``, which calls it once per column.

    Raises:
        KeyError: if *column* has no index label 99.
    """
    return column.loc[99]
# Call the helper once per column (axis=0) to collect each column's value at
# label 99. NOTE: this rebinds the name `hundredth_row` from the helper
# function to the resulting Series.
hundredth_row = titanic_survival.apply(hundredth_row, axis=0)
print(hundredth_row)
def not_null_count(column):
    """Return how many entries of *column* are missing (None/NaN).

    NOTE: despite its name, this counts the null entries, not the non-null
    ones. The count comes from the column's own null mask, so the function
    works for any column and not only for those of one particular DataFrame.
    """
    return int(pd.isnull(column).sum())
# Evaluate the helper once per column (axis=0) and show the per-column result.
null_count = titanic_survival.apply(not_null_count, axis=0)
print(null_count)
def which_class(row):
    """Map a row's ``Pclass`` value to a readable class label.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a "Pclass" entry.

    Returns:
        "Unknown" when Pclass is missing, "First Class" / "Second Class" /
        "Third Class" for 1 / 2 / 3, and None for any other value.
    """
    labels = {1: "First Class", 2: "Second Class", 3: "Third Class"}
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "Unknown"
    # Values other than 1, 2 or 3 have no label and yield None.
    return labels.get(pclass)
# Label every passenger row; axis="columns" hands the helper one row at a time.
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
def generate_age_label(row):
    """Classify a row by its ``Age`` value.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with an "Age" entry.

    Returns:
        "Unknown" when Age is missing, "minor" when Age is below 18,
        otherwise "adult".
    """
    age = row["Age"]
    if pd.isnull(age):
        return "Unknown"
    return "minor" if age < 18 else "adult"
# Label every passenger row by age group (one row per call).
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)

# Survival rate per age group: store the labels as a column, then pivot on it
# (pivot_table averages by default).
titanic_survival["age_labels"] = age_labels
age_group_survival = titanic_survival.pivot_table(
    values="Survived", index="age_labels"
)
print(age_group_survival)
# Series
# A DataFrame is the table (matrix-like structure) that pandas reads data into.
# A single row or a single column of a DataFrame is called a Series.
import pandas as pd

# Load the Fandango film ratings and pull out one column.
fandango = pd.read_csv('E:/唐宇迪数据集/fandango.csv')
series_film = fandango["FILM"]
print(type(series_film))
print(series_film[0:5])
# .values exposes the data held by the column.
film_names = series_film.values
print(type(film_names))
# The result above shows that a DataFrame is made up of Series, and a Series holds an ndarray.
import pandas as pd
from pandas import Series

fandango = pd.read_csv('E:/唐宇迪数据集/fandango.csv')
series_film = fandango["FILM"]
film_names = series_film.values
series_rt = fandango["RottenTomatoes"]
rt_scores = series_rt.values

# Build a Series of scores indexed by film title, so a score can be looked up
# by name.
series_custom = Series(rt_scores, index=film_names)
print(series_custom[['Do You Believe? (2015)', 'Avengers: Age of Ultron (2015)']])
# Positional slice, written with .iloc so it cannot be mistaken for a label
# lookup on the title index.
print(series_custom.iloc[5:10])

# Sorting: reorder by an explicitly sorted list of index labels ...
original_index = series_custom.index.tolist()
sorted_index = sorted(original_index)
print(sorted_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)

# ... or let pandas sort by index label or by value.
print('**********')
print(series_custom.sort_index())
print("***********************************************")
print(series_custom.sort_values())
# Adding two Series together.
import numpy as np

print(np.add(series_custom, series_custom))

# Boolean filtering: keep only the entries with a score above 50.
series_greater_than_50 = series_custom[series_custom > 50]

# Two Series sharing the film title as index; arithmetic between them is
# matched up by index label.
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
# NOTE: despite the name, rt_min holds the average of the two scores.
rt_min = (rt_critics + rt_users) / 2
print(rt_min)
# Use the film title as the row index; drop=False keeps FILM as a regular
# column too.
fandango_films = fandango.set_index('FILM', drop=False)
print(fandango_films)

# Label-based row slices, with plain [] and with .loc.
print(fandango_films['Avengers: Age of Ultron (2015)':'Hot Tub Time Machine 2 (2015)'])
print(fandango_films.loc['Avengers: Age of Ultron (2015)':'Hot Tub Time Machine 2 (2015)'])

# Specific rows picked by a list of titles.
movies = ['Avengers: Age of Ultron (2015)', 'Hot Tub Time Machine 2 (2015)']
print(fandango_films.loc[movies])
# In effect there are now two ways to address a row: by its string label, or
# by the original numeric position.
fandango_films = fandango.set_index('FILM', drop=False)
types = fandango_films.dtypes
print(types)

# Keep only the columns whose dtype is float64.
float_columns = types[types.values == 'float64'].index
float_df = fandango_films[float_columns]

# Row-wise (axis=1) standard deviation across the two user-score columns.
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
print(rt_mt_user)
a = rt_mt_user.apply(lambda x: np.std(x), axis=1)
print(a)