python数据分析库pandas-三、: python数据分析处理库-Pandas

本节内容:

Pandas数据读取

Pandas索引与计算

Pandas数据预处理实例

Pandas常用预处理方法

Pandas自定义函数

Series结构

1、Pandas数据读取

import pandas

food_info= pandas.read_csv("food_info.csv")#读取一个以逗号分隔开的文件

#print(type(food_info)) #print(food_info.dtypes)

print(help(pandas.read_csv))

first_rows = food_info.head()#不加参数默认显示前5条数据,指定参数后可根据参数进行显示

print(first_rows )

print(food_info.head(3))#显示前3条数据

print(food_info.tail(3))#显示后3条数据

print(food_info.columns)#显示列名

print(food_info.shape)#多少行多少列

2、Pandas索引与计算

索引:

#要执行下列程序必先执行上面的程序

#pandas uses zero-indexing

#Seriesobject representing the row at index 0.

print(food_info.loc[0])#通过调用loc这个函数来进行索引,取出第一行的元素

# Seriesobjectrepresenting the seventh row.

print(food_info.loc[6])#通过调用loc这个函数来进行索引,取出第7行的元素

# Willthrow an error: "KeyError: "the label [8620] is not in the [index]""print(food_info.loc[8620])#当行数超出时会报错(8618, 36)

#Theobject dtype is equivalent to a string in Python

#object - For stringvalues

#int -For integer values

#float - For floatvalues

#datetime-For time values

#bool -For Boolean values

print(food_info.dtypes)#看每个元素的数据类型

# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.

food_info.loc[3:6]

# Returns a DataFrame containing the rows at indexes2, 5, and 10. Either of the following approaches will work.

# Method1two_five_ten= [2,5,10]

food_info.loc[two_five_ten]

# Method2food_info.loc[[2,5,10]]#返回一个包含了2,5,10行的数据框

#Series object representing the "NDB_No" column.#ndb_col = food_info["NDB_No"]#根据"NDB_No"这个列名来打印此列#print(ndb_col)#Alternatively, you can access a column by passing in a string variable.

col_name = "NDB_No"ndb_col=food_info[col_name]print(ndb_col)

importpandas

food_info= pandas.read_csv("food_info.csv")#columns = ["Zinc_(mg)", "Copper_(mg)"]#指定要拿的两个列,存在一个列表中#zinc_copper = food_info[columns]#print(zinc_copper)

zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]print(zinc_copper)

#print(food_info.columns)#print(food_info.head(2))

col_names = food_info.columns.tolist() #把所有的列名取出来并放在一个列表中

print(col_names)

gram_columns= [] #构造一个空列表

for c in col_names: #对存放了列名的列表进行遍历

if c.endswith("(g)"): #对每一个col_names中的列名进行查看,如果是以"(g)"结尾的列名,则把它加到gram_columns列表里

gram_columns.append(c)print(gram_columns)

gram_df= food_info[gram_columns] #根据列名,输出对应的每一列

print(gram_df.head(3))#只输出前三行

加减乘除计算:

print(food_info["Iron_(mg)"])#打印列名为"Iron_(mg)"的这一列

div_1000 = food_info["Iron_(mg)"] / 1000 #将这一列的值都除以1000

print(div_1000)#Adds 100 to each value in the column and returns a Series object.

add_100 = food_info["Iron_(mg)"] + 100

print(add_100 )#Subtracts 100 from each value in the column and returns a Series object.

sub_100 = food_info["Iron_(mg)"] - 100

print(sub_100 )#Multiplies each value in the column by 2 and returns a Series object.

mult_2 = food_info["Iron_(mg)"]*2

print(mult_2 )

#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on

water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]#对应位置上的两个数相乘

weighted_protein = food_info["Protein_(g)"] * 2

weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]

initial_rating = weighted_protein + weighted_fat

print(weighted_protein,weighted_fat,initial_rating)

#the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79#For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result,#due to the scale of the values#The largest value in the "Energ_Kcal" column.

max_calories = food_info["Energ_Kcal"].max() #取列名为"Energ_Kcal"这一列最大的数#Divide the values in "Energ_Kcal" by the largest value.

normalized_calories = food_info["Energ_Kcal"] /max_calories

normalized_protein= food_info["Protein_(g)"] / food_info["Protein_(g)"].max()

normalized_fat= food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()

food_info["Normalized_Protein"] =normalized_protein #在food_info中加入新的列,列名为"Normalized_Protein"

food_info["Normalized_Fat"] = normalized_fat

print(food_info.shape) #打印food_info.csv的行列数,(8618, 36)

iron_grams = food_info["Iron_(mg)"] / 1000food_info["Iron_(g)"] = iron_grams #在food_info中加入新的列,列名为Iron_(g)

print(food_info.shape) #(8618, 37)

#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame#Sorts the DataFrame in-place, rather than returning a new DataFrame.对DataFrame进行就地排序,而不是返回新的DataFrame。

print(food_info["Sodium_(mg)"])

food_info.sort_values("Sodium_(mg)", inplace=True)#默认对"Sodium_(mg)"这一列从小到大进行排序

print(food_info["Sodium_(mg)"])#Sorts by descending order, rather than ascending.按降序排序,而不是升序排序。

food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)print(food_info["Sodium_(mg)"])

3、Pandas数据预处理实例

泰坦尼克号:

把数据集读取进来

importpandas as pdimportnumpy as np

titanic_survival= pd.read_csv("titanic_train.csv")

titanic_survival.head()

一些数据的处理:

#The Pandas library uses NaN(缺失值), which stands for "not a number", to indicate a missing value.#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values

age = titanic_survival["Age"] #把"Age"这一列的值放在age这个列表中

print(age.loc[0:10]) #看一下age这一列的前11个值

age_is_null = pd.isnull(age) #判断age中那些样本的值是空的(缺失)

print(age_is_null) #返回一系列真值和假值

age_null_true = age[age_is_null] #把返回ture的这个位置拿出来做索引,找出这些空缺值

print(age_null_true)

age_null_count= len(age_null_true) #看空缺值的个数

print(age_null_count)

#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value

mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])print(mean_age) #nan 因为空缺值的存在导致无法计算均值

把空缺值去掉:(并不需要把所有的空缺值去掉,有时候会用中位数,均值来替换空缺值以保证数据的完整)

#we have to filter out the missing values before we calculate the mean.在计算平均值之前,我们必须过滤掉遗漏的值。

good_ages = titanic_survival["Age"][age_is_null == False] #把"Age"中不是空缺的值拿出来放在新的列表“good_ages”中#print good_ages

correct_mean_age = sum(good_ages) / len(good_ages) #计算新列表的均值

print(correct_mean_age) #29.6991176471

#missing data is so common that many pandas methods automatically filter for it丢失的数据是如此常见,以至于许多熊猫方法会自动为其过滤

correct_mean_age = titanic_survival["Age"].mean() #通过调用.mean()自动过滤空缺值,再进行求均值

print(correct_mean_age) #29.69911764705882

#mean fare for each class 每个船舱等级(1,2,3)的平均票价

passenger_classes = [1, 2, 3]

fares_by_class={}for this_class in passenger_classes: #对船舱等级进行遍历

pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class] #先取出1等级的数据,再取2等级的数据,最后取三等级的数据

pclass_fares = pclass_rows["Fare"] #把1等级的船费票价取出来放pclass_fares,然后再是2...,3

fare_for_class = pclass_fares.mean() #求1等级的船费票价的均值

fares_by_class[this_class] = fare_for_class #求1,2,3等级的平均票价放在fares_by_class这个字典里面。

print(fares_by_class) #{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

.pivot_table()

#index tells the method which column to group by index告诉方法按哪个列分组#values is the column that we want to apply the calculation to 值是我们要应用计算的列#aggfunc specifies the calculation we want to perform aggfunc指定我们要执行的计算

passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean) #.pivot_table()统计一个量与其他量之间的关系

print(passenger_survival)###

看船舱等级与获救乘客之间有什么联系,1等舱平均获救多少人,2等舱平均获救多少人,3等舱平均获救多少人

Pclass1 0.629630

2 0.472826

3 0.242363Name: Survived, dtype: float64

船舱等级越高与获救率越高###

passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age") #未指定 aggfunc,默认按照求均值的方法去计算print(passenger_age)###

看船舱等级与年龄之间的一个联系

Pclass1 38.233441

2 29.877630

3 25.140620Name: Age, dtype: float64

船舱等级越高坐着年龄越大的人###

port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)#看一个量与另外两个量之间的关系

print(port_stats)###

看登船地点C、Q、S与票价收费总额和获救总人数的关系

Fare Survived

Embarked

C10072.2962 93Q1022.2543 30S17439.3988 217

###

.dropna()函数

#specifying axis=1 or axis="columns" will drop any columns that have null values指定axis=1或axis="columns"将删除任何具有null值的列

drop_na_columns = titanic_survival.dropna(axis=1) #删除任何具有null值的列

new_titanic_survival= titanic_survival.dropna(axis=0,subset=["Age", "Sex"])#将"Age","Sex"这两列的中有null的行删除

print(new_titanic_survival)

具体定位到一个值:

row_index_83_age = titanic_survival.loc[83,"Age"] #定位带编号为83的样本,特征量为"Age"的这一个值

row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]print(row_index_83_age) #28.0

print(row_index_1000_pclass) #1

.reset_index(drop=True)

new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)#对"Age"这一列进行降序排列

print(new_titanic_survival[0:10])

titanic_reindexed= new_titanic_survival.reset_index(drop=True) #对索引值进行重新排序 drop=True:原来的不要了,形成新的值

print(titanic_reindexed.iloc[0:10])

4、自定义函数并应用:.apply()

#This function returns the hundredth item from a series 这个函数返回系列中的第一百项

defhundredth_row(column):#Extract the hundredth item 提取第一百项

hundredth_item = column.iloc[99]returnhundredth_item#Return the hundredth item from each column

hundredth_row =titanic_survival.apply(hundredth_row)print(hundredth_row)

defnot_null_count(column):

column_null= pd.isnull(column)#查看列值是否空缺,将真假值返回

null = column[column_null]#用真值做索引,取出空缺值放到null中

returnlen(null)

column_null_count= titanic_survival.apply(not_null_count)#通过.apply()调用自定义得函数

print(column_null_count )

#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.#通过传入axis=1参数,我们可以使用DataFrame.apply()方法遍历行而不是列。

def which_class(row): #通过自定义的which_class函数对船舱等级进行数据类型转换

pclass = row["Pclass"]ifpd.isnull(pclass):return "Unknown"

elif pclass == 1:return "First Class"

elif pclass == 2:return "Second Class"

elif pclass == 3:return "Third Class"classes= titanic_survival.apply(which_class, axis=1)print(classes)

defis_minor(row):if row["Age"] < 18:returnTrueelse:returnFalse

minors= titanic_survival.apply(is_minor, axis=1)#print(minors)

defgenerate_age_label(row):

age= row["Age"]ifpd.isnull(age):return "unknown"

elif age < 18:return "minor"

else:return "adult"age_labels= titanic_survival.apply(generate_age_label, axis=1)print(age_labels)#把连续值转换为离散值

titanic_survival["age_labels"] =age_labels

age_group_survival= titanic_survival.pivot_table(index="age_labels", values="Survived")print(age_group_survival )###

age_labels

adult0.381032minor0.539823unknown0.293785Name: Survived, dtype: float64###

5、Series结构

#Series (collection of values) 一行或一列#DataFrame (collection of Series objects)#Panel (collection of DataFrame objects)

#A Series object can hold many data types, including#float - for representing float values#int - for representing integer values#bool - for representing Boolean values#datetime64[ns] - for representing date & time, without time-zone#datetime64[ns, tz] - for representing date & time, with time-zone#timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)#category - for representing categorical values#object - for representing String values

#FILM - film name#RottenTomatoes - Rotten Tomatoes critics average score#RottenTomatoes_User - Rotten Tomatoes user average score#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)#Metacritic - Metacritic critics average score#Metacritic_User - Metacritic user average score

importpandas as pd

fandango= pd.read_csv("fandango_score_comparison.csv")

series_film= fandango["FILM"]print(series_film[0:5])

series_rt= fandango["RottenTomatoes"]print (series_rt[0:5])###

0 Avengers: Age of Ultron (2015)1 Cinderella (2015)2 Ant-Man (2015)3 Do You Believe? (2015)4 Hot Tub Time Machine 2 (2015)

Name: FILM, dtype: object

074

1 85

2 80

3 18

4 14Name: RottenTomatoes, dtype: int64###

#Import the Series object from pandas

from pandas importSeries

film_names= series_film.values #通过.values 取film这一列的值

print(type(film_names)) #,可以证明pandas是在numpy上进行封装#print(film_names)

print(len(film_names)) #146

rt_scores =series_rt.values#print(rt_scores)

print(len(rt_scores)) #146

series_custom = Series(rt_scores , index=film_names) #可用"str’型来做索引,给出对应评分

series_custom[["Minions (2015)", "Leviathan (2014)"]]###

Minions (2015) 54Leviathan (2014) 99dtype: int64###

#int index is also aviable

series_custom = Series(rt_scores , index=film_names)

series_custom[["Minions (2015)", "Leviathan (2014)"]]

fiveten= series_custom[5:10]print(fiveten)###

The Water Diviner (2015) 63Irrational Man (2015) 42Top Five (2014) 86Shaun the Sheep Movie (2015) 99Love& Mercy (2015) 89dtype: int64###

original_index =series_custom.index.tolist()print(type(original_index)) #

print(original_index)

sorted_index=sorted(original_index)

sorted_by_index=series_custom.reindex(sorted_index)print(sorted_by_index)#按字母进行排序

sc2 =series_custom.sort_index()

sc3=series_custom.sort_values()print(sc2[0:10])print(sc3[0:10])

#The values in a Series object are treated as an ndarray, the core data type in NumPy

importnumpy as np#Add each value with each other

print(np.add(series_custom, series_custom) )#Apply sine function to each value

np.sin(series_custom)#Return the highest value (will return a single value not a Series)

np.max(series_custom)

#will actually return a Series object with a boolean value for each film

series_custom > 50series_greater_than_50= series_custom[series_custom > 50]

criteria_one= series_custom > 50criteria_two= series_custom < 75both_criteria= series_custom[criteria_one &criteria_two]print(both_criteria)

#data alignment same index

rt_critics = Series(fandango["RottenTomatoes"].values, index=fandango["FILM"])

rt_users= Series(fandango["RottenTomatoes_User"].values, index=fandango["FILM"])

rt_mean= (rt_critics + rt_users)/2#求两个媒体评分的均值

print(rt_mean)

importpandas as pd#will return a new DataFrame that is indexed by the values in the specified column#and will drop that column from the DataFrame#without the FILM column dropped

fandango = pd.read_csv("fandango_score_comparison.csv")printtype(fandango)

fandango_films= fandango.set_index("FILM", drop=False)#print(fandango_films.index)

#Slice using either bracket notation or loc[]

fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]

fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]#Specific movie

fandango_films.loc["Kumiko, The Treasure Hunter (2015)"]#Selecting list of movies

movies = ["Kumiko, The Treasure Hunter (2015)", "Do You Believe? (2015)", "Ant-Man (2015)"]

fandango_films.loc[movies]#When selecting multiple rows, a DataFrame is returned,#but when selecting an individual row, a Series object is returned instead

#The apply() method in Pandas allows us to specify Python logic#The apply() method requires you to pass in a vectorized operation#that can be applied over each Series object.

importnumpy as np#returns the data types as a Series

types =fandango_films.dtypes#print types#filter data types to just floats, index attributes returns just column names

float_columns = types[types.values == "float64"].index#use bracket notation to filter columns to just float columns

float_df =fandango_films[float_columns]#print float_df#`x` is a Series object representing a column

deviations = float_df.apply(lambdax: np.std(x))print(deviations)

rt_mt_user = float_df[["RT_user_norm", "Metacritic_user_nom"]]

rt_mt_user.apply(lambda x: np.std(x), axis=1)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值