titanic.csv文件内容
基本操作
# Load the Titanic passenger data from the CSV file.
df = pd.read_csv('./data/titanic.csv')
# Preview the first 6 rows.
df.head(6)
# Row labels, column labels, per-column dtypes and the underlying array.
df.index
df.columns
df.dtypes
df.values
# First 5 values of the Age column (still on the default index here).
df['Age'][:5]
# Use the Name column as the row index; Age values are now labelled by Name.
df = df.set_index('Name')
df['Age'][:5]
# Basic statistics on a single column.
# (The original line used typographic quotes, which is a syntax error.)
age = df['Age']
age.mean()
age.min()
age.max()
# describe() summarises only numeric columns by default;
# describe(include='all') summarises every column.
# One statement per line: three calls on one line is a syntax error.
df.describe()
df.describe(include=[np.number])
# np.object was removed from NumPy; the builtin `object` is the replacement.
df.describe(include=[object])
# Series operations (s1 is defined outside this snippet; presumably a pandas Series)
# Replace every value equal to 100 with 101, modifying s1 in place
s1.replace(to_replace = 100,value = 101,inplace = True)
# Rename the index label 'a' to 'A', modifying s1 in place
s1.rename(index = {'a':'A'},inplace = True)
# The first sample (row at position 0)
df.iloc[0]
# Value of column 'A' in the row labelled 'a'.
# A single .loc[row, column] lookup replaces the chained df.loc['a']['A'],
# which indexes twice through an intermediate row object.
df.loc['a', 'A']
# Append rows by concatenation (axis = 0); use axis = 1 to append columns instead
df3 = pd.concat([df,df2],axis = 0)
# Sort by column k2 in ascending order (returns a new object)
data.sort_values(by='k2')
# Sort by 'group' descending, then by 'data' ascending, modifying data in place
data.sort_values(by=['group','data'],ascending = [False,True],inplace=True)
# Drop rows that are duplicates across all columns (returns a new object)
data.drop_duplicates()
# Drop rows whose k1 value duplicates an earlier row
data.drop_duplicates(subset='k1')
# Enumeration mapping: translate each value of the 'food' column through
# food2Upper and store the result in a new column 'upper'
food2Upper = {
'A1':'A',
'A2':'A',
'B1':'B',
'B2':'B',
'B3':'B',
'C1':'C',
'C2':'C'
}
# Values of 'food' that are not keys of food2Upper become NaN in 'upper'
data['upper'] = data['food'].map(food2Upper)
# Drop the row labelled 'j' in place; pass axis=1 to drop a column instead
df5.drop(['j'],axis=0,inplace = True)
# Replace every NaN with 9, in place
data.replace(np.nan,9,inplace=True)
# Binning: assign each age to a labelled interval.
# The edges 10/20/40/80 define the intervals (10, 20], (20, 40] and (40, 80],
# which are labelled in order with group_names.
group_names = ['Yonth', 'Mille', 'Old']
ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79]
bins = [10, 20, 40, 80]
bins_res = pd.cut(x=ages, bins=bins, labels=group_names)
bins_res
***
[Yonth, Yonth, Yonth, Mille, Mille, Mille, Old, Old, Old, Old]
Categories (3, object): [Yonth < Mille < Old]
***
# Count how many values fall into each bin, most frequent first.
# The top-level pd.value_counts() is deprecated (pandas 2.1); wrapping the
# data in a Series and calling .value_counts() is the documented replacement.
pd.Series(bins_res).value_counts()
***
Old 4
Yonth 3
Mille 3
dtype: int64
***
# Element-wise null check: True wherever a value is missing
df.isnull()
# Whether each column contains any null; pass axis = 1 to any() to check per row instead
df.isnull().any()
# Fill missing values with 5 (returns a new object; the result is not assigned here)
df.fillna(5)
# Show the rows that contain at least one null
df[df.isnull().any(axis = 1)]
Pandas索引
- loc 用label(标签)来定位
- iloc 用position(位置)来定位
# First 5 rows, columns at positions 1 and 2 (positions are zero-based and the slice end is excluded)
df.iloc[0:5,1:3]
# Use the Name column as the row index so rows can be looked up by passenger name
df = df.set_index('Name')
# Fare value of the row labelled 'Heikkinen, Miss. Laina'
df.loc['Heikkinen, Miss. Laina','Fare']
# Boolean Series: True where Fare is greater than 40
df['Fare'] > 40
***
Name
Braund, Mr. Owen Harris False
Cumings, Mrs. John Bradley (Florence Briggs Thayer) True
Heikkinen, Miss. Laina False
Futrelle, Mrs. Jacques Heath (Lily May Peel) True
***
# First 5 rows whose Fare is greater than 40
df[df['Fare'] > 40][:5]
# Mean age of male passengers
df.loc[df['Sex'] == 'male', 'Age'].mean()
# Number of rows with Age greater than 70
(df['Age'] > 70).sum()
# Select column 'A' with a predicate on the column labels.
# DataFrame.select() was deprecated in pandas 0.21 and removed in 1.0;
# mapping the predicate over the labels and passing the result to .loc
# is the documented replacement.
df.loc[:, df.columns.map(lambda x: x == 'A')]
# Filter rows with a boolean expression written over column names
df.query('(a<b) & (b<c)')
Groupby
# Mean age per sex; .value_counts() can be used in place of .mean()
df.groupby('Sex')['Age'].mean()
# Per-sex sum of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where non-numeric columns
# are no longer silently dropped from the aggregation.
df.groupby('Sex').sum(numeric_only=True)
# Group by two keys and count the rows in each (A, B) combination
grouped = df.groupby(['A', 'B'])
grouped.size()
Merge
# how='outer' is a full outer join; how='left' / how='right' give left / right joins.
# indicator = True adds a _merge column telling which side each row came from
# (both / left_only / right_only, as in the output below).
res = pd.merge(left, right, on = ['key1', 'key2'], how = 'outer', indicator = True)
**
A B key1 key2 C D _merge
0 A0 B0 K0 K0 C0 D0 both
1 A1 B1 K1 K1 C1 D1 both
2 A2 B2 K2 K2 C2 D2 both
3 A3 B3 K3 K3 NaN NaN left_only
4 NaN NaN K3 K4 C3 D3 right_only
**
# Join: 'key' is a column of left and is matched against the index of right
result = left.join(right, on='key')
显示设置
# Number of decimal places shown for floating-point values
pd.set_option('display.precision',2)
# Maximum number of rows shown when printing
pd.set_option('display.max_rows',6)
# Maximum number of columns shown when printing
pd.set_option('display.max_columns',30)
# Maximum width, in characters, of a column's displayed text
pd.set_option('display.max_colwidth',100)
Pivot用法
# Mean survival rate split by age group (rows) and sex (columns);
# aggfunc may also be 'max', 'count', etc.
# Underaged is True for Age <= 18; rows with a missing Age compare as False.
df['Underaged'] = df['Age'] <= 18
# Index by the Underaged flag computed above. The original indexed by
# 'Pclass', which left Underaged unused and did not match the sample
# output (indexed by Underaged) shown below.
df.pivot_table(index='Underaged', columns='Sex', values='Survived', aggfunc='mean')
***
Sex female male
Underaged
False 0.760163 0.167984
True 0.676471 0.338028
***
时间操作
# Assumes `data` is indexed by a DatetimeIndex named Time with values like
# 2009-01-01 00:00:00 - confirm against the code that loads it.
# Rows between two timestamps
data[('2012-01-01 09:00'):('2012-01-01 19:00')]
# All rows from 2013. Partial-string row selection must go through .loc:
# data['2013'] is treated as a column lookup on pandas >= 2.0.
data.loc['2013']
# Rows from January through March 2012
data['2012-01':'2012-03']
# Rows from any January
data[data.index.month == 1]
# Rows whose hour is strictly between 8 and 12 (i.e. hours 9, 10, 11)
data[(data.index.hour > 8) & (data.index.hour < 12)]
# Rows whose time of day lies between 08:00 and 12:00
data.between_time('08:00', '12:00')
# Daily mean; mean can be swapped for max, min, etc. 'D' can be '3D'
# (3-day bins) or 'M' for monthly bins (newer pandas spells month-end 'ME').
data.resample('D').mean().head()
***
L06_347 LS06_347 LS06_348 month
Time
2009-01-01 0.125010 0.092281 0.016635 1
2009-01-02 0.124146 0.095781 0.016406 1
2009-01-03 0.113562 0.085542 0.016094 1
2009-01-04 0.140198 0.102708 0.017323 1
2009-01-05 0.128812 0.104490 0.018167 1
***
Apply
# Return the 100th sample
def hundredth_row(columns):
    """Return the entry at position 99 (the 100th entry) of ``columns``.

    Written for ``DataFrame.apply``, which passes one column at a time, so
    the combined result is the 100th row of the frame.

    Args:
        columns: An object supporting positional ``.iloc`` access, such as
            a pandas Series.

    Returns:
        The value stored at position 99.

    Raises:
        IndexError: If ``columns`` has fewer than 100 entries.
    """
    return columns.iloc[99]
# Apply the function with the default axis (once per column, assuming titanic is a DataFrame).
# NOTE(review): this assignment rebinds the name hundredth_row from the
# function to the result, so the function is no longer reachable afterwards.
hundredth_row = titanic.apply(hundredth_row)
# Count missing values
def not_null_count(columns):
    """Return the number of null entries in ``columns``.

    NOTE: despite the name, this counts the entries that ARE null, which is
    what the original implementation computed.

    Args:
        columns: A pandas Series (or other array-like accepted by
            ``pd.isnull``).

    Returns:
        int: How many entries ``pd.isnull`` reports as missing.
    """
    # Summing the boolean mask counts the True (null) entries directly,
    # without materialising the filtered subset first.
    return int(pd.isnull(columns).sum())
# Apply not_null_count with the default axis (once per column, assuming titanic is a DataFrame)
columns_null_count = titanic.apply(not_null_count)
# Enumeration mapping
def which_class(row):
    """Translate a row's ``Pclass`` value into a readable label.

    Args:
        row: A mapping-like row (e.g. a pandas Series) with a ``'Pclass'``
            entry.

    Returns:
        ``'Unknow'`` when Pclass is null; ``'First class'``,
        ``'Second class'`` or ``'Third class'`` for 1, 2 or 3; ``None`` for
        any other value.
    """
    pclass = row['Pclass']
    # Null must be tested first: NaN never compares equal to 1, 2 or 3.
    if pd.isnull(pclass):
        return 'Unknow'
    labels = {1: 'First class', 2: 'Second class', 3: 'Third class'}
    # Unrecognised classes yield None, as the original if/elif chain did
    # implicitly by falling off the end.
    return labels.get(pclass)
# axis = 1 passes one row at a time to which_class, giving one label per row
classes = titanic.apply(which_class,axis = 1)
# Conditional test
def is_minor(row):
    """Return True when the row's ``Age`` is below 18, otherwise False.

    A missing (NaN) age compares as not-less-than 18, so it yields False.

    Args:
        row: A mapping-like row (e.g. a pandas Series) with an ``'Age'``
            entry.

    Returns:
        bool: Whether ``row['Age'] < 18``.
    """
    # bool() keeps the return type a plain Python bool, as the original
    # True/False literals were, even when Age is a NumPy scalar.
    return bool(row['Age'] < 18)
# axis = 1 passes one row at a time to is_minor, giving one boolean per row
minors = titanic.apply(is_minor,axis = 1)