本节内容:
Pandas数据读取
Pandas索引与计算
Pandas数据预处理实例
Pandas常用预处理方法
Pandas自定义函数
Series结构
1、Pandas数据读取
import pandas
food_info= pandas.read_csv("food_info.csv")#读取一个以逗号分隔开的文件
#print(type(food_info)) #print(food_info.dtypes)
print(help(pandas.read_csv))
first_rows = food_info.head()#不加参数默认显示前5条数据,指定参数后可根据参数进行显示
print(first_rows )
print(food_info.head(3))#显示前3条数据
print(food_info.tail(3))#显示后3条数据
print(food_info.columns)#显示列名
print(food_info.shape)#多少行多少列
2、Pandas索引与计算
索引:
#要执行下列程序必先执行上面的程序
#pandas uses zero-indexing
#Seriesobject representing the row at index 0.
print(food_info.loc[0])#通过调用loc这个函数来进行索引,取出第一行的元素
# Seriesobjectrepresenting the seventh row.
print(food_info.loc[6])#通过调用loc这个函数来进行索引,取出第7行的元素
# Willthrow an error: "KeyError: "the label [8620] is not in the [index]""print(food_info.loc[8620])#当行数超出时会报错(8618, 36)
#Theobject dtype is equivalent to a string in Python
#object - For stringvalues
#int -For integer values
#float - For floatvalues
#datetime-For time values
#bool -For Boolean values
print(food_info.dtypes)#看每个元素的数据类型
# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
food_info.loc[3:6]
# Returns a DataFrame containing the rows at indexes2, 5, and 10. Either of the following approaches will work.
# Method1two_five_ten= [2,5,10]
food_info.loc[two_five_ten]
# Method2food_info.loc[[2,5,10]]#返回一个包含了2,5,10行的数据框
#Series object representing the "NDB_No" column.#ndb_col = food_info["NDB_No"]#根据"NDB_No"这个列名来打印此列#print(ndb_col)#Alternatively, you can access a column by passing in a string variable.
col_name = "NDB_No"ndb_col=food_info[col_name]print(ndb_col)
importpandas
food_info= pandas.read_csv("food_info.csv")#columns = ["Zinc_(mg)", "Copper_(mg)"]#指定要拿的两个列,存在一个列表中#zinc_copper = food_info[columns]#print(zinc_copper)
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]print(zinc_copper)
#print(food_info.columns)#print(food_info.head(2))
col_names = food_info.columns.tolist() #把所有的列名取出来并放在一个列表中
print(col_names)
gram_columns= [] #构造一个空列表
for c in col_names: #对存放了列名的列表进行遍历
if c.endswith("(g)"): #对每一个col_names中的列名进行查看,如果是以"(g)"结尾的列名,则把它加到gram_columns列表里
gram_columns.append(c)print(gram_columns)
gram_df= food_info[gram_columns] #根据列名,输出对应的每一列
print(gram_df.head(3))#只输出前三行
加减乘除计算:
print(food_info["Iron_(mg)"])#打印列名为"Iron_(mg)"的这一列
div_1000 = food_info["Iron_(mg)"] / 1000 #将这一列的值都除以1000
print(div_1000)#Adds 100 to each value in the column and returns a Series object.
add_100 = food_info["Iron_(mg)"] + 100
print(add_100 )#Subtracts 100 from each value in the column and returns a Series object.
sub_100 = food_info["Iron_(mg)"] - 100
print(sub_100 )#Multiplies each value in the column by 2 and returns a Series object.
mult_2 = food_info["Iron_(mg)"]*2
print(mult_2 )
#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]#对应位置上的两个数相乘
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
print(weighted_protein,weighted_fat,initial_rating)
#the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79#For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result,#due to the scale of the values#The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max() #取列名为"Energ_Kcal"这一列最大的数#Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] /max_calories
normalized_protein= food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat= food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] =normalized_protein #在food_info中加入新的列,列名为"Normalized_Protein"
food_info["Normalized_Fat"] = normalized_fat
print(food_info.shape) #打印food_info.csv的行列数,(8618, 36)
iron_grams = food_info["Iron_(mg)"] / 1000food_info["Iron_(g)"] = iron_grams #在food_info中加入新的列,列名为Iron_(g)
print(food_info.shape) #(8618, 37)
#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame#Sorts the DataFrame in-place, rather than returning a new DataFrame.对DataFrame进行就地排序,而不是返回新的DataFrame。
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True)#默认对"Sodium_(mg)"这一列从小到大进行排序
print(food_info["Sodium_(mg)"])#Sorts by descending order, rather than ascending.按降序排序,而不是升序排序。
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)print(food_info["Sodium_(mg)"])
3、Pandas数据预处理实例
泰坦尼克号:
把数据集读取进来
importpandas as pdimportnumpy as np
titanic_survival= pd.read_csv("titanic_train.csv")
titanic_survival.head()
一些数据的处理:
#The Pandas library uses NaN(缺失值), which stands for "not a number", to indicate a missing value.#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"] #把"Age"这一列的值放在age这个列表中
print(age.loc[0:10]) #看一下age这一列的前11个值
age_is_null = pd.isnull(age) #判断age中那些样本的值是空的(缺失)
print(age_is_null) #返回一系列真值和假值
age_null_true = age[age_is_null] #把返回ture的这个位置拿出来做索引,找出这些空缺值
print(age_null_true)
age_null_count= len(age_null_true) #看空缺值的个数
print(age_null_count)
#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])print(mean_age) #nan 因为空缺值的存在导致无法计算均值
把空缺值去掉:(并不需要把所有的空缺值去掉,有时候会用中位数,均值来替换空缺值以保证数据的完整)
#we have to filter out the missing values before we calculate the mean.在计算平均值之前,我们必须过滤掉遗漏的值。
good_ages = titanic_survival["Age"][age_is_null == False] #把"Age"中不是空缺的值拿出来放在新的列表“good_ages”中#print good_ages
correct_mean_age = sum(good_ages) / len(good_ages) #计算新列表的均值
print(correct_mean_age) #29.6991176471
#missing data is so common that many pandas methods automatically filter for it丢失的数据是如此常见,以至于许多熊猫方法会自动为其过滤
correct_mean_age = titanic_survival["Age"].mean() #通过调用.mean()自动过滤空缺值,再进行求均值
print(correct_mean_age) #29.69911764705882
#mean fare for each class 每个船舱等级(1,2,3)的平均票价
passenger_classes = [1, 2, 3]
fares_by_class={}for this_class in passenger_classes: #对船舱等级进行遍历
pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class] #先取出1等级的数据,再取2等级的数据,最后取三等级的数据
pclass_fares = pclass_rows["Fare"] #把1等级的船费票价取出来放pclass_fares,然后再是2...,3
fare_for_class = pclass_fares.mean() #求1等级的船费票价的均值
fares_by_class[this_class] = fare_for_class #求1,2,3等级的平均票价放在fares_by_class这个字典里面。
print(fares_by_class) #{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
.pivot_table()
#index tells the method which column to group by index告诉方法按哪个列分组#values is the column that we want to apply the calculation to 值是我们要应用计算的列#aggfunc specifies the calculation we want to perform aggfunc指定我们要执行的计算
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean) #.pivot_table()统计一个量与其他量之间的关系
print(passenger_survival)###
看船舱等级与获救乘客之间有什么联系,1等舱平均获救多少人,2等舱平均获救多少人,3等舱平均获救多少人
Pclass1 0.629630
2 0.472826
3 0.242363Name: Survived, dtype: float64
船舱等级越高与获救率越高###
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age") #未指定 aggfunc,默认按照求均值的方法去计算print(passenger_age)###
看船舱等级与年龄之间的一个联系
Pclass1 38.233441
2 29.877630
3 25.140620Name: Age, dtype: float64
船舱等级越高坐着年龄越大的人###
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)#看一个量与另外两个量之间的关系
print(port_stats)###
看登船地点C、Q、S与票价收费总额和获救总人数的关系
Fare Survived
Embarked
C10072.2962 93Q1022.2543 30S17439.3988 217
###
.dropna()函数
#specifying axis=1 or axis="columns" will drop any columns that have null values指定axis=1或axis="columns"将删除任何具有null值的列
drop_na_columns = titanic_survival.dropna(axis=1) #删除任何具有null值的列
new_titanic_survival= titanic_survival.dropna(axis=0,subset=["Age", "Sex"])#将"Age","Sex"这两列的中有null的行删除
print(new_titanic_survival)
具体定位到一个值:
row_index_83_age = titanic_survival.loc[83,"Age"] #定位带编号为83的样本,特征量为"Age"的这一个值
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]print(row_index_83_age) #28.0
print(row_index_1000_pclass) #1
.reset_index(drop=True)
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)#对"Age"这一列进行降序排列
print(new_titanic_survival[0:10])
titanic_reindexed= new_titanic_survival.reset_index(drop=True) #对索引值进行重新排序 drop=True:原来的不要了,形成新的值
print(titanic_reindexed.iloc[0:10])
4、自定义函数并应用:.apply()
#This function returns the hundredth item from a series 这个函数返回系列中的第一百项
defhundredth_row(column):#Extract the hundredth item 提取第一百项
hundredth_item = column.iloc[99]returnhundredth_item#Return the hundredth item from each column
hundredth_row =titanic_survival.apply(hundredth_row)print(hundredth_row)
defnot_null_count(column):
column_null= pd.isnull(column)#查看列值是否空缺,将真假值返回
null = column[column_null]#用真值做索引,取出空缺值放到null中
returnlen(null)
column_null_count= titanic_survival.apply(not_null_count)#通过.apply()调用自定义得函数
print(column_null_count )
#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.#通过传入axis=1参数,我们可以使用DataFrame.apply()方法遍历行而不是列。
def which_class(row): #通过自定义的which_class函数对船舱等级进行数据类型转换
pclass = row["Pclass"]ifpd.isnull(pclass):return "Unknown"
elif pclass == 1:return "First Class"
elif pclass == 2:return "Second Class"
elif pclass == 3:return "Third Class"classes= titanic_survival.apply(which_class, axis=1)print(classes)
defis_minor(row):if row["Age"] < 18:returnTrueelse:returnFalse
minors= titanic_survival.apply(is_minor, axis=1)#print(minors)
defgenerate_age_label(row):
age= row["Age"]ifpd.isnull(age):return "unknown"
elif age < 18:return "minor"
else:return "adult"age_labels= titanic_survival.apply(generate_age_label, axis=1)print(age_labels)#把连续值转换为离散值
titanic_survival["age_labels"] =age_labels
age_group_survival= titanic_survival.pivot_table(index="age_labels", values="Survived")print(age_group_survival )###
age_labels
adult0.381032minor0.539823unknown0.293785Name: Survived, dtype: float64###
5、Series结构
#Series (collection of values) 一行或一列#DataFrame (collection of Series objects)#Panel (collection of DataFrame objects)
#A Series object can hold many data types, including#float - for representing float values#int - for representing integer values#bool - for representing Boolean values#datetime64[ns] - for representing date & time, without time-zone#datetime64[ns, tz] - for representing date & time, with time-zone#timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)#category - for representing categorical values#object - for representing String values
#FILM - film name#RottenTomatoes - Rotten Tomatoes critics average score#RottenTomatoes_User - Rotten Tomatoes user average score#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)#Metacritic - Metacritic critics average score#Metacritic_User - Metacritic user average score
importpandas as pd
fandango= pd.read_csv("fandango_score_comparison.csv")
series_film= fandango["FILM"]print(series_film[0:5])
series_rt= fandango["RottenTomatoes"]print (series_rt[0:5])###
0 Avengers: Age of Ultron (2015)1 Cinderella (2015)2 Ant-Man (2015)3 Do You Believe? (2015)4 Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
074
1 85
2 80
3 18
4 14Name: RottenTomatoes, dtype: int64###
#Import the Series object from pandas
from pandas importSeries
film_names= series_film.values #通过.values 取film这一列的值
print(type(film_names)) #,可以证明pandas是在numpy上进行封装#print(film_names)
print(len(film_names)) #146
rt_scores =series_rt.values#print(rt_scores)
print(len(rt_scores)) #146
series_custom = Series(rt_scores , index=film_names) #可用"str’型来做索引,给出对应评分
series_custom[["Minions (2015)", "Leviathan (2014)"]]###
Minions (2015) 54Leviathan (2014) 99dtype: int64###
#int index is also aviable
series_custom = Series(rt_scores , index=film_names)
series_custom[["Minions (2015)", "Leviathan (2014)"]]
fiveten= series_custom[5:10]print(fiveten)###
The Water Diviner (2015) 63Irrational Man (2015) 42Top Five (2014) 86Shaun the Sheep Movie (2015) 99Love& Mercy (2015) 89dtype: int64###
original_index =series_custom.index.tolist()print(type(original_index)) #
print(original_index)
sorted_index=sorted(original_index)
sorted_by_index=series_custom.reindex(sorted_index)print(sorted_by_index)#按字母进行排序
sc2 =series_custom.sort_index()
sc3=series_custom.sort_values()print(sc2[0:10])print(sc3[0:10])
#The values in a Series object are treated as an ndarray, the core data type in NumPy
importnumpy as np#Add each value with each other
print(np.add(series_custom, series_custom) )#Apply sine function to each value
np.sin(series_custom)#Return the highest value (will return a single value not a Series)
np.max(series_custom)
#will actually return a Series object with a boolean value for each film
series_custom > 50series_greater_than_50= series_custom[series_custom > 50]
criteria_one= series_custom > 50criteria_two= series_custom < 75both_criteria= series_custom[criteria_one &criteria_two]print(both_criteria)
#data alignment same index
rt_critics = Series(fandango["RottenTomatoes"].values, index=fandango["FILM"])
rt_users= Series(fandango["RottenTomatoes_User"].values, index=fandango["FILM"])
rt_mean= (rt_critics + rt_users)/2#求两个媒体评分的均值
print(rt_mean)
importpandas as pd#will return a new DataFrame that is indexed by the values in the specified column#and will drop that column from the DataFrame#without the FILM column dropped
fandango = pd.read_csv("fandango_score_comparison.csv")printtype(fandango)
fandango_films= fandango.set_index("FILM", drop=False)#print(fandango_films.index)
#Slice using either bracket notation or loc[]
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]#Specific movie
fandango_films.loc["Kumiko, The Treasure Hunter (2015)"]#Selecting list of movies
movies = ["Kumiko, The Treasure Hunter (2015)", "Do You Believe? (2015)", "Ant-Man (2015)"]
fandango_films.loc[movies]#When selecting multiple rows, a DataFrame is returned,#but when selecting an individual row, a Series object is returned instead
#The apply() method in Pandas allows us to specify Python logic#The apply() method requires you to pass in a vectorized operation#that can be applied over each Series object.
importnumpy as np#returns the data types as a Series
types =fandango_films.dtypes#print types#filter data types to just floats, index attributes returns just column names
float_columns = types[types.values == "float64"].index#use bracket notation to filter columns to just float columns
float_df =fandango_films[float_columns]#print float_df#`x` is a Series object representing a column
deviations = float_df.apply(lambdax: np.std(x))print(deviations)
rt_mt_user = float_df[["RT_user_norm", "Metacritic_user_nom"]]
rt_mt_user.apply(lambda x: np.std(x), axis=1)