import os

import matplotlib.pyplot as plt
import pandas as pd
# Load the training data from <cwd>/欧洲杯+数据练习/exercise_data/train.csv.
path = os.path.join(os.getcwd(), '欧洲杯+数据练习', 'exercise_data', 'train.csv')
df = pd.read_csv(path)
df.info()
df.head()  # the result is only displayed in an interactive session
# Font setup: use SimHei so the Chinese chart titles below can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']
# CJK fonts such as SimHei commonly lack the Unicode minus sign (U+2212);
# fall back to the ASCII hyphen so negative tick labels (e.g. -1) render.
plt.rcParams['axes.unicode_minus'] = False
# Convert the string label to a number; intended to be applied to a column.
def sex_transform(sex):
    """Encode a sex label as an integer.

    Returns 1 when ``sex`` equals ``'male'`` and 0 for any other value.
    """
    return 1 if sex == 'male' else 0
# Encode Sex as 1 (male) / 0 (anything else).
df['Sex'] = df['Sex'].apply(sex_transform)
# Alternative: df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
Survived_0 = df['Sex'][df.Survived == 0]
Survived_1 = df['Sex'][df.Survived == 1]
# Draw the stacked histogram of Sex split by survival.
# Importing again is a no-op when pyplot is already loaded; kept so this
# section remains self-contained.
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 8), dpi=80)
# Labels must follow the order of the data list: the Survived == 0 series
# comes first, so it is the 'not saved' one.
plt.hist([Survived_0, Survived_1], stacked=True, label=['not saved', 'Rescued'])
plt.xticks([-1, 0, 1, 2], [-1, "F", "M", 2])
plt.legend()
# Handle missing values: replace missing ages with the mean age.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Bucket ages into bands.
def age_transform(age):
    """Map an age to a band code.

    Returns 1 when ``age <= 18``, 2 when ``age <= 40``, and 3 otherwise.
    """
    if age <= 18:
        return 1
    if age <= 40:
        return 2
    return 3
# Replace each age with its band code (1, 2 or 3).
df['Age'] = df['Age'].apply(age_transform)

# Histogram of age band split by survival.
plt.figure(figsize=(8, 5), dpi=80)
Survived_0 = df['Age'][df.Survived == 0]
Survived_1 = df['Age'][df.Survived == 1]
# Labels follow the data order: Survived == 0 first ('not saved'),
# Survived == 1 second ('Rescued').
plt.hist([Survived_0, Survived_1], label=['not saved', 'Rescued'])
plt.xticks([1, 2, 3], ['child', 'youth', 'elderly'])
plt.title('Age_Survived')
plt.legend()
plt.show()

# Histogram of passenger class split by survival.
plt.figure(figsize=(8, 5), dpi=80)
Survived_0 = df['Pclass'][df.Survived == 0]
Survived_1 = df['Pclass'][df.Survived == 1]
plt.hist([Survived_0, Survived_1], label=['not saved', 'Rescued'])
plt.xticks([1, 2, 3], ['Upper', 'Middle', 'lower'])
plt.title('Pclass_Survived')
plt.legend()
plt.show()
# Count all passengers per (Sex, Age) group.
group_all = df.groupby(by=['Sex', 'Age'])['Sex'].count()
# Count the Survived == 1 passengers per (Sex, Age) group.
df_survive = df[df.Survived == 1]
group_all_survive = df_survive.groupby(by=['Sex', 'Age'])['Sex'].count()
# A group with no survivors is absent from the survivor counts; give it an
# explicit 0 so its rate is 0.0 instead of NaN after the division.
group_all_survive = group_all_survive.reindex(group_all.index, fill_value=0)
# Survival rate per group.
group_all_rate = group_all_survive / group_all
# Draw the bar chart.
bar = group_all_rate.plot.bar(title='性别年龄共同对生还率的影响')
# Label every bar: get_x() is the bar's left edge, get_height() its value;
# the text is placed just above the bar and shows the rate as a percentage.
for b in bar.patches:
    bar.text(b.get_x(), b.get_height() * 1.01, '%.2f%%' % (b.get_height() * 100))
# Render the chart, as the histogram sections do.
plt.show()
# Factor the rate computation and the plotting into reusable functions.
# This one returns the survival rate.
def group_rate(df, cols):
    """Return the survival rate for every combination of values in ``cols``.

    Args:
        df: DataFrame containing a ``Survived`` column (rows where it equals
            1 count as survivors) and the columns named in ``cols``.
        cols: A column name, or a list of column names, to group by.

    Returns:
        A Series indexed by the group keys whose values are
        survivors-in-group / rows-in-group. Groups without any survivor get
        0.0 rather than NaN.
    """
    # A bare string would otherwise be indexed by character via cols[0].
    if isinstance(cols, str):
        cols = [cols]
    group_all = df.groupby(by=cols)[cols[0]].count()
    df_survive = df[df.Survived == 1]
    group_all_survive = df_survive.groupby(by=cols)[cols[0]].count()
    # Groups with zero survivors are missing from the survivor counts; align
    # to the full set of groups and treat them as 0.
    group_all_survive = group_all_survive.reindex(group_all.index, fill_value=0)
    return group_all_survive / group_all
# Draw the chart.
def group_plot(df, title):
    """Draw ``df`` as a bar chart and label each bar with its percentage.

    Args:
        df: Object exposing ``.plot.bar`` (e.g. the result of ``group_rate``).
        title: Chart title passed through to ``plot.bar``.
    """
    axes = df.plot.bar(title=title)
    for patch in axes.patches:
        # get_x() is the bar's left edge; the label sits just above the bar.
        height = patch.get_height()
        axes.text(patch.get_x(), height * 1.01, f'{height * 100:.2f}%')
group_plot(group_rate(df, ['Sex', 'Pclass']), title='性别等级共同对生还率的影响')
group_plot(group_rate(df, ['Age', 'Pclass']), title='年龄等级共同对生还率的影响')
# Render the two bar charts, as the histogram sections do.
plt.show()

df['Age'].unique()  # array of the distinct values
df['Age'].nunique()  # number of distinct values
df.isnull().sum()  # number of missing values per column
df.notnull().sum()  # number of non-missing values per column
df['Age'].cumsum().head()  # cumulative sum (cumulative concatenation for strings)
df['Age'].idxmax()  # index label of the largest value
df.describe()  # summary statistics
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs.
for columnName, columnData in df.describe().items():
    print('columnName:', columnName)  # the column name
    print('column Contents:', columnData.values)  # columnData is a Series; .values is its array
df1 = df.sample(n=7, replace=False)  # replace=False means sampling without replacement
df1 = df['Name'].apply(lambda x: x + 'QQ')  # apply a function to every value of a column
# axis=1 places the frames side by side (column-wise); axis=0 would stack rows.
df1 = pd.concat([df, df, df], axis=1)
pd.merge(left=df, right=df, on='PassengerId', how='outer')  # join on a key column
df.set_index(keys='PassengerId')  # use a column as the index (the keyword is `keys`)
df.reindex()  # returns a new DataFrame conformed to the given index (none given here)
df.sort_values(by=['Sex', 'Age'], ascending=[True, False])  # sort by several columns
df.duplicated(subset=['Age'], keep='first')  # True for every repeat after the first occurrence
df.filter(like='class', axis=1)  # keep the columns whose label contains 'class'
# Drop a column. Not done in place: the statements below still read 'Age'.
df.drop(labels='Age', axis=1)
df.isnull().any(axis=0)  # which columns contain missing values
df.loc[df.Age > 10]  # select rows with a boolean mask
df.loc[df.Age > 60, 'Age'] = 60  # overwrite a column on the matching rows
# Drop rows by index label (the keyword is `labels`).
# NOTE(review): the original passed an undefined name `index`; the selection
# below reuses the preceding condition as a stand-in -- confirm which rows
# were meant to be removed.
rows_to_drop = df.index[df.Age > 60]
df.drop(labels=rows_to_drop, axis=0, inplace=True)
df.query('Age>60 & Embarked=="S"')  # filter rows with a query expression
df.fillna(value='-1', inplace=True)  # fill missing values
# Map a column through a dictionary.
Pclass_map = {1: 'Upper', 2: 'Middle', 3: 'lower'}
df.Pclass.map(Pclass_map)
df.Pclass.value_counts()  # count occurrences of each value