import numpy as np
import pandas as pd
import os
# Load the Titanic training set via a path relative to the working
# directory, then preview the first five rows (the default for ``head``).
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(5)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Show the current working directory; relative paths such as 'train.csv'
# are resolved against it.
os.getcwd()
'C:\\Users\\royryanwang\\Desktop\\DW数分'
# ``read_table`` splits on tabs by default, so a comma-separated file is
# parsed as a single wide column. The separator is spelled out here to
# make that default visible.
pd.read_table('train.csv', sep='\t')
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 0 1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/... 1 2,1,1,"Cumings, Mrs. John Bradley (Florence Br... 2 3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,S... 3 4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May ... 4 5,0,3,"Allen, Mr. William Henry",male,35,0,0,3... ... ... 886 887,0,2,"Montvila, Rev. Juozas",male,27,0,0,21... 887 888,1,1,"Graham, Miss. Margaret Edith",female,... 888 889,0,3,"Johnston, Miss. Catherine Helen ""Car... 889 890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,11... 890 891,0,3,"Dooley, Mr. Patrick",male,32,0,0,3703...
891 rows × 1 columns
# Passing ``chunksize`` makes ``read_csv`` return a reader object instead
# of a DataFrame; iterating it yields frames of at most 1000 rows each.
chunker = pd.read_csv('train.csv', chunksize=1_000)
chunker
<pandas.io.parsers.TextFileReader at 0x22317586f48>
将表头改成中文,索引改为乘客ID
# Re-read the file with Chinese column labels. ``header=0`` marks the first
# line of the file as the original header so it is replaced by ``names``;
# the passenger-ID column then becomes the row index.
# NOTE(review): '兄弟姐 妹个数' contains an embedded space -- confirm it is
# intentional, since the label is used verbatim as the column name.
df = pd.read_csv(
    'train.csv',
    header=0,
    names=[
        '乘客ID',
        '是否幸存',
        '仓位等级',
        '姓名',
        '性别',
        '年龄',
        '兄弟姐 妹个数',
        '父母子女个数',
        '船票信息',
        '票价',
        '客舱',
        '登船港口',
    ],
    index_col='乘客ID',
)
df.head(5)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐 妹个数 父母子女个数 船票信息 票价 客舱 登船港口 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
dataframe 更改行列名的其他方法
from pandas import DataFrame, Series
# Method 1: overwrite every column label at once by assigning a new list
# to ``columns`` (the list length must match the number of columns).
data = DataFrame(dict(a=[1, 2, 3, 4], b=[4, 5, 6, 7]))
data.columns = ["c", "d"]
from pandas import DataFrame, Series
# Method 2: map old labels to new ones with ``rename``.
data = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7]})
# ``rename`` returns a new DataFrame and leaves the original untouched, so
# the result has to be bound back to ``data`` for the change to persist.
data = data.rename(columns={"a": "c", "b": "d"})
# Keep the renamed frame as the final expression so a notebook still shows it.
data
from pandas import DataFrame, Series
# Method 3: remove each column with ``pop`` and put it back at the same
# position under a new label with ``insert``.
data = DataFrame(dict(a=[1, 2, 3, 4], b=[4, 5, 6, 7]))
data.insert(loc=0, column='c', value=data.pop('a'))
data.insert(loc=1, column='d', value=data.pop('b'))
# Renaming row labels: build a small frame from a dict of dicts (outer keys
# become columns, inner keys become the index) and relabel two rows in place.
d = {
    'one': {'a': 1, 'b': 2, 'c': 3, 'd': 4},
    'two': {'a': 5, 'b': 6, 'c': 7, 'd': 8},
    'three': {'a': 9, 'b': 10, 'c': 11, 'd': 12},
}
# Use a dedicated name for the demo frame: binding it to ``df`` would
# replace the Titanic DataFrame that the later cells still operate on.
demo_df = pd.DataFrame(d)
print(demo_df)
# ``inplace=True`` mutates ``demo_df`` directly; labels that are not in the
# mapping ('c', 'd') are left unchanged.
demo_df.rename(index={'a': 'aa', 'b': 'bb'}, inplace=True)
print(demo_df)
one two three
a 1 5 9
b 2 6 10
c 3 7 11
d 4 8 12
one two three
aa 1 5 9
bb 2 6 10
c 3 7 11
d 4 8 12
查看数据的基本信息
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 是否幸存 891 non-null int64
1 仓位等级 891 non-null int64
2 姓名 891 non-null object
3 性别 891 non-null object
4 年龄 714 non-null float64
5 兄弟姐 妹个数 891 non-null int64
6 父母子女个数 891 non-null int64
7 船票信息 891 non-null object
8 票价 891 non-null float64
9 客舱 204 non-null object
10 登船港口 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
观察表格前10行的数据和后15行的数据
# First 10 rows of the table.
df.head(n=10)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐 妹个数 父母子女个数 船票信息 票价 客舱 登船港口 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
# Last 15 rows of the table.
df.tail(n=15)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐 妹个数 父母子女个数 船票信息 票价 客舱 登船港口 乘客ID 877 0 3 Gustafsson, Mr. Alfred Ossian male 20.0 0 0 7534 9.8458 NaN S 878 0 3 Petroff, Mr. Nedelio male 19.0 0 0 349212 7.8958 NaN S 879 0 3 Laleff, Mr. Kristo male NaN 0 0 349217 7.8958 NaN S 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0 1 230433 26.0000 NaN S 882 0 3 Markun, Mr. Johann male 33.0 0 0 349257 7.8958 NaN S 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0 0 0 7552 10.5167 NaN S 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 NaN S 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 NaN S 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 382652 29.1250 NaN Q 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q
判断数据是否为空:为空的地方返回True,其余地方返回False
# Boolean mask of missing values (True where a cell is missing), previewed
# for the first five rows. ``isna`` is the same method as ``isnull``.
df.isna().head(5)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐 妹个数 父母子女个数 船票信息 票价 客舱 登船港口 乘客ID 1 False False False False False False False False False True False 2 False False False False False False False False False False False 3 False False False False False False False False False True False 4 False False False False False False False False False False False 5 False False False False False False False False False True False
对于一个数据,还可以从哪些方面来观察?
可以从数据本身去看,比如对于此数据,我们可以分别统计仓位各等级有多少人、男女各有多少人,以及平均年龄、平均兄弟姐妹个数、平均父母子女个数和平均票价。
1、分组:groupby 是 Pandas 中最为常用和有效的分组函数。
1)按列分组
注意:使用 groupby() 函数生成的对象(例如下文的 group_level)是一个中间分组变量,为 GroupBy 类型。
既可依据单个列名 'key1' 进行分组,也可依据多个列名 ['key1', 'key2'] 进行分组。
2)按分组统计:在分组对象(如 group1、group2)上应用 size()、sum()、count() 等统计函数,能分别统计各分组的行数、各列的分组求和、各列在每个分组中的非空值个数。
详情参见此 CSDN 博客:https://blog.csdn.net/elecjack/article/details/50760736
# Group passengers by cabin class; ``group_level`` is the intermediate
# GroupBy object, not yet an aggregated table.
group_level = df.groupby('仓位等级')
# Restrict the sum to numeric columns explicitly. Without ``numeric_only``,
# recent pandas versions also try to sum the text columns (name, ticket,
# cabin, ...) instead of dropping them.
group_level.sum(numeric_only=True)
是否幸存 年龄 兄弟姐 妹个数 父母子女个数 票价 仓位等级 1 136 7111.42 90 77 18177.4125 2 87 5168.83 74 70 3801.8417 3 119 8924.92 302 193 6714.6951
# Per-sex totals of the numeric columns. Note that ``group_gender`` holds
# the aggregated table, not the GroupBy object.
# ``numeric_only=True`` keeps text columns (name, ticket, cabin, ...) out of
# the sum regardless of the installed pandas version.
group_gender = df.groupby('性别').sum(numeric_only=True)
group_gender
是否幸存 仓位等级 年龄 兄弟姐 妹个数 父母子女个数 票价 性别 female 233 678 7286.00 218 204 13966.6628 male 109 1379 13919.17 248 136 14727.2865
# Average age. Missing ages are skipped (``skipna=True`` is the default),
# so the mean is taken over the non-null entries only.
print(df["年龄"].mean(skipna=True))
29.69911764705882
# Average number of parents/children travelling with each passenger.
print(df.loc[:, "父母子女个数"].mean())
0.38159371492704824
加载并做出改变的数据,在工作目录下保存为一个新文件train_chinese.csv
# Write the relabelled table to the working directory. The index is
# written as the first column and the file is UTF-8 encoded; both are the
# ``to_csv`` defaults, spelled out here because the headers are non-ASCII.
df.to_csv('train_chinese.csv', index=True, encoding='utf-8')