# Source code download:
# http://download.csdn.net/download/adam_zs/10174600
import pandas
# Load the food nutrition table and inspect its basic structure.
food_info = pandas.read_csv("food_info.csv") # the first row is used as the column names by default
print(food_info)
print(type(food_info)) # DataFrame
print(food_info.dtypes)
first_rows = food_info.head() # head() shows the first 5 rows by default
print(first_rows)
print(food_info.head(3)) # first 3 rows
print(food_info.tail(3)) # last 3 rows
print(food_info.shape)
print(food_info.columns) # column names
# The bare string below is a no-op expression kept as a reference list of the column names.
'''
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
'Cholestrl_(mg)']
'''
# Label-based row selection with .loc, followed by column selection with [].
print(food_info.loc[0]) # first row
print(food_info.loc[6]) # seventh row
# print(food_info.loc[8620]) #KeyError: 'the label [8620] is not in the [index]'
print(food_info.loc[3:6]) # slice: returns rows 3, 4, 5 and 6 (both endpoints are included)
print(food_info.loc[[2, 5, 10]]) # rows 2, 5 and 10
print(food_info["NDB_No"]) # the NDB_No column
print(food_info[["Zinc_(mg)", "Copper_(mg)"]]) # the Zinc_(mg) and Copper_(mg) columns
# Gather every column whose name ends with "(g)" into its own DataFrame
# and show its first three rows.
col_names = food_info.columns.tolist()
gram_columns = [name for name in col_names if name.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
import pandas
# Reload the table, list its columns, then show arithmetic between a column and a scalar.
food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()
print(col_names)
# ['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)',
# 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
# 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)',
# 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
# 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)',
# 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
print(food_info[col_names])
print(food_info["Iron_(mg)"])
# Each operation below combines every value of the column with the scalar.
print(food_info["Iron_(mg)"] / 1000)
print(food_info["Iron_(mg)"] + 100)
print(food_info["Iron_(mg)"] - 100)
print(food_info["Iron_(mg)"] * 2)
# Multiplication / division between columns works position by position.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams # store the converted values as a new column
# Weighted score: protein counts twice, fat counts -0.75.
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
print(food_info["Energ_Kcal"])
# Normalize columns by dividing each one by its own maximum value.
max_calories = food_info["Energ_Kcal"].max()
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
# inplace=True sorts the DataFrame in place instead of returning a new DataFrame.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=True) # ascending order (the default)
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False) # descending order
print(food_info["Sodium_(mg)"])
import pandas as pd
import numpy as np
# Titanic survival data.
# English rendering of the column glossary kept in the no-op string below:
#   PassengerId - passenger id            Survived - label value
#   Pclass      - cabin class             Name     - name
#   Sex         - sex                     Age      - age
#   SibSp       - number of siblings      Parch    - total of elders and children
#   Ticket      - ticket number           Fare     - fare
#   Cabin       - seat                    Embarked - place of embarkation
'''
'PassengerId' 乘客id
'Survived' 标签值
'Pclass' 仓位等级
'Name' 姓名
'Sex' 性别
'Age' 年龄
'SibSp' 兄弟姐妹数量
'Parch' 老人孩子总数
'Ticket' 票号
'Fare' 票价
'Cabin' 座位
'Embarked' 登船地点
'''
# pd.set_option('display.height', 1000)
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.columns)
print(titanic_survival.head())
# The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
# we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
# Count the rows that have no Age value.
age = titanic_survival["Age"]
print(age)
age_is_null = pd.isnull(age) # pd.isnull -> True: missing value, False: not a missing value
print(age_is_null)
age_null_true = age[age_is_null]
print(len(age_null_true)) # number of missing Age values: 177
# The plain sum over a column that contains missing values comes out as nan.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age) # nan
# Keep only the rows whose Age is present, then average those by hand.
good_ages = titanic_survival["Age"][age_is_null == False]
print(sum(good_ages) / len(good_ages)) # 29.6991176471
correct_mean_age = titanic_survival["Age"].mean() # mean() leaves out the rows that have no age
print(correct_mean_age) # 29.69911764705882
# Average fare for each passenger class, keyed by the class number.
passenger_classes = [1, 2, 3]
fares_by_class = {
    pclass: titanic_survival.loc[titanic_survival["Pclass"] == pclass, "Fare"].mean()
    for pclass in passenger_classes
}
print(fares_by_class)
# pivot_table aggregates data:
#   index   - the column to group by
#   values  - the column(s) the calculation is applied to
#   aggfunc - the calculation to perform
# The aggregations are given by name ("mean", "sum"); recent pandas releases
# emit a FutureWarning when NumPy callables such as np.mean / np.sum are passed.
passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(passenger_Fare)
passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare")  # the mean is the default aggregation
print(passenger_Fare)
# Group by Embarked and sum up Fare and Survived.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
# Handling missing values: fill them, or drop the rows / columns that contain them.
print(titanic_survival)
# axis=1 (or axis='columns') drops every column that contains a null value.
print(titanic_survival.fillna(0)) # fill every missing value with 0
print(titanic_survival.dropna(axis=1)) # drop the columns that contain null values
print(titanic_survival.dropna(axis=0)) # drop the rows that contain null values
print(titanic_survival.dropna(axis=0, subset=["Age", "Sex"])) # drop the rows where "Age" or "Sex" is null
# .loc[row label, column name] picks a single value.
row_index_83_age = titanic_survival.loc[83, "Age"] # row label, column name
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"] # NOTE: despite the variable name, the row label read is 766
print(titanic_survival.loc[83])
print(row_index_83_age)
print(titanic_survival.loc[766])
print(row_index_1000_pclass)
# Sort by Age; the .loc slice below still goes by row label, not by position.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=True)
print(new_titanic_survival)
print(new_titanic_survival.loc[:10])
itanic_reindexed = new_titanic_survival.reset_index(drop=True) # rebuild the index; returns the re-indexed frame
print(itanic_reindexed.loc[:10])
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99.

    With a default 0-based index this is the hundredth item of the series.
    """
    label = 99
    return column.loc[label]
# apply() runs a custom function; here it is called once per column of the DataFrame.
# NOTE: the result is bound to the name hundredth_row, replacing the function of the same name.
hundredth_row = titanic_survival.apply(hundredth_row)
print(hundredth_row)
def not_null_count(column):
    """Return the number of missing (null/NaN) values in ``column``.

    Note: despite its name, the function counts the *null* entries; the name
    is kept so existing callers keep working.

    Args:
        column: A pandas Series, e.g. one DataFrame column handed over by
            ``DataFrame.apply``.

    Returns:
        int: How many entries of ``column`` are null.
    """
    column_null = pd.isnull(column)
    # Count on the mask itself instead of indexing the module-level
    # DataFrame, so the function works for any Series it is given.
    return int(column_null.sum())
# pd.isnull on a whole DataFrame gives a frame of booleans; len() of it is the row count.
print(pd.isnull(titanic_survival))
print(len(pd.isnull(titanic_survival))) # 891
column_null_count = titanic_survival.apply(not_null_count, axis=0) # axis=0: per column, axis=1: per row
print(column_null_count)
def which_class(row):
    """Translate the ``Pclass`` value of ``row`` into a readable label.

    Returns "Unknown" when the value is missing, "First Class" /
    "Second Class" / "Third Class" for 1 / 2 / 3, and None for anything else.
    """
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "Unknown"
    labels = ((1, "First Class"), (2, "Second Class"), (3, "Third Class"))
    for value, label in labels:
        if pclass == value:
            return label
    return None
# axis=1 makes apply() call which_class once per row.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
def is_minor(row):
    """Return True when the passenger described by ``row`` is a minor.

    A minor is anyone with ``Age`` below 18, the same threshold used by
    ``generate_age_label``. A missing age (NaN) compares as False, so such
    rows are reported as not minor.

    Args:
        row: A mapping / Series with an "Age" entry.

    Returns:
        bool: True if ``row["Age"] < 18``, otherwise False.
    """
    # The comparison used to be ``> 18``, which flagged adults instead of minors.
    return bool(row["Age"] < 18)
# axis=1 makes apply() call is_minor once per row; the result is one boolean per passenger.
ages = titanic_survival.apply(is_minor, axis=1)
print(ages)
def generate_age_label(row):
    """Classify the ``Age`` value of ``row``.

    Returns "unknown" for a missing age, "minor" for ages below 18 and
    "adult" otherwise.
    """
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    return "minor" if age < 18 else "adult"
# Label every row, store the labels as a new column, then aggregate by label.
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
titanic_survival['age_labels'] = age_labels
# Survival rate per age label (pivot_table averages the values by default).
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)
import pandas as pd
# Raise the display limits so that large frames are printed in full.
# The 'display.height' option no longer exists in current pandas (setting it
# raises OptionError); 'display.max_rows' controls how many rows are printed.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)
# Series (collection of values): a single row or a single column
# DataFrame (collection of Series objects)
# Panel (collection of DataFrame objects)
# A single row or column of a DataFrame is a Series
# Series.values is an ndarray
# Movie ratings
fandango = pd.read_csv('fandango_score_comparison.csv')
print(fandango.head())
series_film = fandango['FILM']
print(type(series_film)) # Series
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])
from pandas import Series
# Pull the raw values out of both Series.
film_names = series_film.values
print(type(film_names)) # ndarray
print(film_names)
rt_scores = series_rt.values
print(rt_scores)
# Use the film names (film_names) as the index
series_custom = Series(rt_scores, index=film_names)
print(series_custom)
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])
# Integer slicing is also available on the Series with the custom index
series_custom = Series(rt_scores, index=film_names)
print(series_custom)
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])
fiveten = series_custom[5:10]
print(fiveten)
# Reorder the Series by a sorted copy of its own index labels.
original_index = series_custom.index.tolist()
print(original_index)
sorted_index = sorted(original_index)
print(sorted_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)
# Built-in alternatives: sort by index label or by value.
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
print(sc2[:10])
print(sc3[:10])
import numpy as np
# NumPy functions can be called directly on a Series.
print(series_custom.head())
# Add each value with each other
print(np.add(series_custom.head(), series_custom.head()))
# Apply sine function to each value
print(np.sin(series_custom))
# Return the highest value (will return a single value not a Series)
print(np.max(series_custom))
# Boolean filtering: a comparison yields a boolean Series that can be used as a mask.
print(fandango.head())
series_custom = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'].values)
print(series_custom.head())
print(series_custom > 50)
series_greater_than_50 = series_custom[series_custom > 50]
# Combine two masks with & to keep the scores above 50 and below 75.
criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
# data alignment same index
# Both Series are indexed by FILM; they are added and halved to get the mean of the two scores.
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users) / 2
print(rt_mean)
import pandas as pd
# Raise the display limits so that large frames are printed in full.
# The 'display.height' option no longer exists in current pandas (setting it
# raises OptionError); 'display.max_rows' controls how many rows are printed.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)
# Load the movie ratings again.
fandango = pd.read_csv('fandango_score_comparison.csv')
print(fandango.head())
print(type(fandango))
# Give the DataFrame an explicit index
fandango_films = fandango.set_index('FILM', drop=True) # use FILM as the index and drop the original FILM column
print(fandango_films)
print(fandango_films.index)
# Slice rows by index label using either bracket notation or loc[];
# both statements select the same range of films.
print(fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])
print(fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])
# Specific movie
print(fandango_films.loc['Kumiko, The Treasure Hunter (2015)'])
# Selecting list of movies
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
print(fandango_films.loc[movies])
# When selecting multiple rows, a DataFrame is returned,
# but when selecting an individual row, a Series object is returned instead
import numpy as np
# Keep only the columns whose dtype is float64.
print(fandango_films.head())
types = fandango_films.dtypes
print(types)
float_columns = types[types.values == 'float64'].index # names of the float64 columns
float_df = fandango_films[float_columns]
print(float_df)
# `x` is a Series object representing a column
deviations = float_df.apply(lambda x: np.std(x))
print(deviations)
# np.std computes the standard deviation
print(float_df.head())
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
# With axis=1 the function is applied to each row instead of each column.
print(rt_mt_user.apply(lambda x: np.std(x), axis=1))