Pandas 数据处理基于 NumPy,DataFrame 由多个 Series 组成,每个 Series 代表一行或者一列。
1.Pandas数据读取
import pandas
# Load the food nutrition table; read_csv returns a DataFrame.
food_info = pandas.read_csv('food_info.csv')
print(type(food_info))   # container type of the loaded table
print(food_info.dtypes)  # dtype of every column

# help() writes the documentation itself and returns None, so it is not
# wrapped in print() -- that would emit a stray "None" line.
help(pandas.read_csv)

# When run as a script a bare expression is discarded, so the previews
# below are printed explicitly.
print(food_info.head(3))    # first 3 rows
print(food_info.tail(4))    # last 4 rows
print(food_info.columns)    # column labels
print(food_info.shape)      # (rows, columns); the original note recorded 8618 rows, 36 columns
print(food_info.loc[0])     # the row labelled 0
print(food_info.loc[3:6])   # rows labelled 3 through 6 (.loc slices include both ends)
# Select a single column by its label.
ndb_col = food_info['NDB_No']
print(ndb_col)

# Select several columns at once by passing a list of labels.
columns = ['Zinc_(mg)', 'Copper_(mg)']
zinc = food_info[columns]
print(zinc)

# All column labels as a plain Python list (these are the column names,
# not the contents of the first column).
col_name = food_info.columns.tolist()
print(col_name)

# Keep only the columns whose label ends with '(g)' and preview them.
gram_columns = [c for c in col_name if c.endswith('(g)')]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
# Show the raw 'Iron_(mg)' column, then the same values divided by 1000.
print(food_info['Iron_(mg)'])
div_1000 = food_info['Iron_(mg)'].div(1000)
print(div_1000)

# Store the scaled values as a new 'Iron_(g)' column; the shape is printed
# before and after the assignment so the change in column count is visible.
iron_grams = food_info['Iron_(mg)'].div(1000)
print(food_info.shape)
food_info['Iron_(g)'] = iron_grams
print(food_info.shape)

# Largest value found in the 'Iron_(mg)' column.
donkey = food_info['Iron_(mg)'].max()
print(donkey)
# Reorder food_info in place by 'Sodium_(mg)', smallest values first.
food_info.sort_values(by='Sodium_(mg)', ascending=True, inplace=True)
print(food_info['Sodium_(mg)'])

# Same in-place sort, largest values first.
food_info.sort_values(by='Sodium_(mg)', ascending=False, inplace=True)
print(food_info['Sodium_(mg)'])
2.索引的计算和数据预处理
import pandas as pd #读取文件
import numpy as np
# Load the Titanic passenger table.
titanic_survival = pd.read_csv('titanic_train.csv')
# A bare expression is discarded when run as a script, so print the preview.
print(titanic_survival.head())

# Ages stored under index labels 0 through 10 (.loc slices include both ends).
age = titanic_survival['Age']
print(age.loc[0:10])

# Boolean mask: True where the age is missing.
age_is_null = pd.isnull(age)
print(age_is_null)

# The missing entries themselves, and how many of them there are.
age_true_null = age[age_is_null]
print(age_true_null)
age_true_num = len(age_true_null)
print(age_true_num)

# Manual mean: keep only the known ages, then divide their sum by their count.
# `~mask` is the element-wise negation and replaces the `mask == False` test.
mean_age = titanic_survival['Age'][~age_is_null]
correct_mean_age = sum(mean_age) / len(mean_age)
print(correct_mean_age)

# Built-in equivalent; Series.mean() skips missing values by default.
correct_mean_age = titanic_survival['Age'].mean()
print(correct_mean_age)
# Mean fare for each passenger class (1, 2, 3), computed by filtering manually.
passager = [1, 2, 3]
gares = {}
for this_class in passager:
    # Rows belonging to the current class only.
    pclass = titanic_survival[titanic_survival['Pclass'] == this_class]
    fares = pclass['Fare']
    mean = fares.mean()
    gares[this_class] = mean
# NOTE(review): the source lost its indentation; the final print is assumed
# to run once after the loop rather than on every iteration -- confirm.
print(gares)
# Per-class mean fare in a single call: one result row per 'Pclass' value.
# The aggregation is named by string; passing the NumPy callable np.mean
# raises a FutureWarning on recent pandas releases.
donkey = titanic_survival.pivot_table(index='Pclass', values='Fare', aggfunc='mean')
print(donkey)

# Several value columns can be aggregated together: mean fare and mean age.
donkey = titanic_survival.pivot_table(index='Pclass', values=['Fare', 'Age'], aggfunc='mean')
print(donkey)
# Drop every column that contains a missing value. Rarely the right move,
# because whole variables are lost.
drop1 = titanic_survival.dropna(axis='columns')

# Drop only the rows in which 'Age' or 'Sex' is missing.
drop2 = titanic_survival.dropna(axis='index', subset=['Age', 'Sex'])
print(drop2)

# Age stored in the row with index label 83.
dingwei = titanic_survival.loc[83, 'Age']
print(dingwei)
# Sorting
# Order the passengers by 'Age', largest first; sort_values returns a new frame.
donkey = titanic_survival.sort_values(by='Age', ascending=False)
print(donkey.head(10))  # first ten rows by position

# Replace the shuffled index labels with a fresh one starting at 0,
# discarding the old labels.
donkey1 = donkey.reset_index(drop=True)
# .loc is label-based and includes both ends: labels 0 through 10.
print(donkey1.loc[0:10])
3.自定义函数
def hundre(colume):
    """Return the entry stored under index label 99 of ``colume``.

    When passed to ``DataFrame.apply`` it is called once per column, so the
    combined result is the row labelled 99 (the 100th row when the frame
    carries the default 0-based index).

    Args:
        colume: a pandas Series.

    Returns:
        The value found at label 99.

    Raises:
        KeyError: if label 99 is not present in the index.
    """
    # .loc is label-based: this looks up the label 99, not the 100th position.
    return colume.loc[99]
# Apply the helper to each column (axis=0) to collect the values at label 99.
# NOTE(review): the result is bound to the name `hundre`, which replaces the
# function of the same name from this point on. The binding is kept as-is so
# the module-level name does not change.
hundre = titanic_survival.apply(hundre, axis=0)
print(hundre)
def donkey(colume):
    """Count the missing (null/NaN) entries in ``colume``.

    Args:
        colume: a pandas Series (one column when used with ``DataFrame.apply``).

    Returns:
        int: number of entries that ``pd.isnull`` reports as missing.
    """
    # Summing the boolean mask counts its True values directly, without
    # building the filtered sub-series first.
    return int(pd.isnull(colume).sum())
# Run the null counter once per column (axis=0) and show the per-column counts.
count = titanic_survival.apply(donkey, axis=0)
print(count)
def which_class(row):
    """Translate a row's numeric 'Pclass' value into a readable label.

    Args:
        row: mapping-like row with a 'Pclass' entry, e.g. the Series handed
            over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        str: 'Unknown' when 'Pclass' is missing, 'First' for 1, 'Second'
        for 2, and 'Third' for every other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return 'Unknown'
    if pclass == 1:
        return 'First'
    if pclass == 2:
        return 'Second'
    # Every remaining value falls through to the third-class label.
    return 'Third'
# Label every row: axis='columns' hands which_class one row at a time.
classes = titanic_survival.apply(which_class, axis='columns')
print(classes)
import pandas as pd
# Load the Fandango score comparison table.
donkey = pd.read_csv('fandango_score_comparison.csv')

# Pull out the 'FILM' column, show its type and the entries labelled 0..5
# (.loc slices include both ends).
film = donkey['FILM']
print(type(film))
print(film.loc[0:5])

# Positional, end-exclusive slice: the first five 'RottenTomatoes' entries.
donkey1 = donkey['RottenTomatoes']
print(donkey1.iloc[0:5])
from pandas import Series
# Extract the underlying array of the 'FILM' column. From here on `donkey`
# refers to that array instead of the DataFrame loaded earlier.
donkey = film.values #values of the FILM column
print(type(donkey)) #the original note records a NumPy type here
print(donkey)
donkey2 = donkey1.values #values of the RottenTomatoes column
print(donkey2)