开始之前,导入numpy、pandas包和数据
# Load the upper-left quarter of the split Titanic training data and preview it.
import numpy as np
import pandas as pd

text = pd.read_csv('data/train-left-up.csv')
text.head()
PassengerId Survived Pclass Name 0 1 0 3 Braund, Mr. Owen Harris 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 3 1 3 Heikkinen, Miss. Laina 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 5 0 3 Allen, Mr. William Henry
2 第二章:数据重构
2.4 数据的合并
2.4.1 任务一:将data文件夹里面的所有数据都载入,观察数据之间的关系
# Load all four quarters of the split training data so their relationship
# (left/right = column split, up/down = row split) can be inspected.
text_left_up = pd.read_csv("data/train-left-up.csv")
text_left_down = pd.read_csv("data/train-left-down.csv")
text_right_up = pd.read_csv("data/train-right-up.csv")
text_right_down = pd.read_csv("data/train-right-down.csv")

from IPython.display import display

# Preview the head of every quarter.
for part in (text_left_up, text_left_down, text_right_up, text_right_down):
    display(part.head())
PassengerId Survived Pclass Name 0 1 0 3 Braund, Mr. Owen Harris 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 3 1 3 Heikkinen, Miss. Laina 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 5 0 3 Allen, Mr. William Henry
PassengerId Survived Pclass Name 0 440 0 2 Kvillner, Mr. Johan Henrik Johannesson 1 441 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield) 2 442 0 3 Hampe, Mr. Leon 3 443 0 3 Petterson, Mr. Johan Emil 4 444 1 2 Reynaldo, Ms. Encarnacion
Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S 1 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C 2 female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S 3 female 35.0 1.0 0.0 113803 53.1000 C123 S 4 male 35.0 0.0 0.0 373450 8.0500 NaN S
Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 male 31.0 0 0 C.A. 18723 10.500 NaN S 1 female 45.0 1 1 F.C.C. 13529 26.250 NaN S 2 male 20.0 0 0 345769 9.500 NaN S 3 male 25.0 1 0 347076 7.775 NaN S 4 female 28.0 0 0 230434 13.000 NaN S
【提示】结合之前我们加载的train.csv数据,大致预测一下上面的数据是什么
把数据切分成了四个部分,左上左下右上右下,拼在一起就是原来的数据
2.4.2:任务二:使用concat方法:将数据train-left-up.csv和train-right-up.csv横向合并为一张表,并保存这张表为result_up
# Task 2: glue the two upper quarters side by side (axis=1 -> column-wise concat).
list_up = [text_left_up, text_right_up]
result_up = pd.concat(list_up, axis=1)
result_up.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1.0 0.0 3.0 Braund, Mr. Owen Harris male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S 1 2.0 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1.0 0.0 PC 17599 71.2833 C85 C 2 3.0 1.0 3.0 Heikkinen, Miss. Laina female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S 3 4.0 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1.0 0.0 113803 53.1000 C123 S 4 5.0 0.0 3.0 Allen, Mr. William Henry male 35.0 0.0 0.0 373450 8.0500 NaN S
concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True):
concat方法可以指定按某个轴进行连接,也可以指定连接的方式join(outer,inner只有这两种)。与数据库不同的是concat不会去重,要达到去重的效果可以使用drop_duplicates方法
eg:
# concat demo: inner join along columns, hierarchical keys along rows,
# and drop_duplicates to de-duplicate after an outer concatenation.
from pandas import Series, DataFrame, concat

df1 = DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'],
                 'rank': range(1, 4)})
df2 = DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'],
                 'rank': [1, 4, 5]})

print('按轴进行内连接\r\n', concat([df1, df2], join="inner", axis=1))
print('进行外连接并指定keys(行索引)\r\n', concat([df1, df2], keys=['a', 'b']))
print('去重后\r\n', concat([df1, df2], ignore_index=True).drop_duplicates())
按轴进行内连接
city rank city rank
0 Chicago 1 Chicago 1
1 San Francisco 2 Boston 4
2 New York City 3 Los Angeles 5
进行外连接并指定keys(行索引)
city rank
a 0 Chicago 1
1 San Francisco 2
2 New York City 3
b 0 Chicago 1
1 Boston 4
2 Los Angeles 5
去重后
city rank
0 Chicago 1
1 San Francisco 2
2 New York City 3
4 Boston 4
5 Los Angeles 5
2.4.3 任务三:使用concat方法:将train-left-down和train-right-down横向合并为一张表,并保存这张表为result_down。然后将上边的result_up和result_down纵向合并为result。
# Task 3: build the lower half column-wise, then stack it under result_up
# (default axis=0 -> row-wise concat) to reassemble the full table.
list_down = [text_left_down, text_right_down]
result_down = pd.concat(list_down, axis=1)
result = pd.concat([result_up, result_down])
result.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1.0 0.0 3.0 Braund, Mr. Owen Harris male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S 1 2.0 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1.0 0.0 PC 17599 71.2833 C85 C 2 3.0 1.0 3.0 Heikkinen, Miss. Laina female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S 3 4.0 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1.0 0.0 113803 53.1000 C123 S 4 5.0 0.0 3.0 Allen, Mr. William Henry male 35.0 0.0 0.0 373450 8.0500 NaN S
2.4.4 任务四:使用DataFrame自带的方法join方法和append:完成任务二和任务三的任务
# Task 4: rebuild the table with DataFrame.join (column-wise, index-aligned)
# and a row-wise stack.
# Fix: the original bound the upper half to a misspelled name ("resul_up") and
# then stacked the stale result_up left over from the concat task — it only
# appeared to work because that earlier variable held the same data.
result_up = text_left_up.join(text_right_up)
result_down = text_left_down.join(text_right_down)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the identical row-wise result.
result = pd.concat([result_up, result_down])
result.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1.0 0.0 3.0 Braund, Mr. Owen Harris male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S 1 2.0 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1.0 0.0 PC 17599 71.2833 C85 C 2 3.0 1.0 3.0 Heikkinen, Miss. Laina female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S 3 4.0 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1.0 0.0 113803 53.1000 C123 S 4 5.0 0.0 3.0 Allen, Mr. William Henry male 35.0 0.0 0.0 373450 8.0500 NaN S
2.4.5 任务五:使用Panads的merge方法和DataFrame的append方法:完成任务二和任务三的任务
# Task 5: column-wise alignment via pd.merge on the row index, then stack
# the two halves row-wise.
result_up = pd.merge(text_left_up, text_right_up, left_index=True, right_index=True)
result_down = pd.merge(text_left_down, text_right_down, left_index=True, right_index=True)
# Fix: the original stacked onto the misspelled "resul_up" from task 4 instead
# of the result_up built just above; DataFrame.append is also removed in
# pandas 2.0, so use pd.concat for the row-wise stack.
result = pd.concat([result_up, result_down])
result.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1.0 0.0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1.0 0.0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0.0 0.0 373450 8.0500 NaN S
【思考】对比merge、join以及concat的方法的不同以及相同。思考一下在任务四和任务五的情况下,为什么都要求使用DataFrame的append方法,如何只要求使用merge或者join可不可以完成任务四和任务五呢?
2.4.6 任务六:完成的数据保存为result.csv
# Task 6: persist the reassembled table (row index included) to result.csv.
result. to_csv( 'result.csv' )
总结:合并dataframe之merge,join,append,concat方法
参考:https://blog.csdn.net/milton2017/article/details/54406482/ https://blog.csdn.net/weixin_43745169/article/details/103595710?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522159816813219724843327476%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fall.%2522%257D&request_id=159816813219724843327476&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2all first_rank_ecpm_v3~rank_business_v1-2-103595710.ecpm_v3_rank_business_v1&utm_term=merge%2Cappend%2Cjoin%2Cconca&spm=1018.2118.3001.4187 merge用法: https://blog.csdn.net/brucewong0516/article/details/82707492
1.merge
# Reference: full signature of pd.merge (illustrative only — `left`/`right`
# are not defined in this file, so this is not meant to be executed).
pd. merge( left, right, how= 'inner' , on= None , left_on= None , right_on= None ,
left_index= False , right_index= False , sort= True ,
suffixes= ( '_x' , '_y' ) , copy= True , indicator= False )
left: 拼接的左侧DataFrame对象
right: 拼接的右侧DataFrame对象
on: 要加入的列或索引级别名称。 必须在左侧和右侧DataFrame对象中找到。 如果未传递且left_index和right_index为False,则DataFrame中的列的交集将被推断为连接键。
left_on:左侧DataFrame中的列或索引级别用作键。 可以是列名,索引级名称,也可以是长度等于DataFrame长度的数组。
right_on: 右侧DataFrame中的列或索引级别用作键。 可以是列名,索引级名称,也可以是长度等于DataFrame长度的数组。
left_index: 如果为True,则使用左侧DataFrame中的索引(行标签)作为其连接键。 对于具有MultiIndex(分层)的DataFrame,级别数必须与右侧DataFrame中的连接键数相匹配。
right_index: 与left_index功能相似。
how: One of 'left', 'right', 'outer', 'inner'. 默认inner。inner是取交集,outer取并集。比如left:['A','B','C'];right:['A','C','D'];inner取交集的话,left中出现的A会和right中出现的每一个A进行匹配拼接,而left中的B在right中没有匹配到,则会丢失。'outer'取并集,出现的A会进行一一匹配,没有同时出现的会将缺失的部分添加缺失值。
sort: 按字典顺序通过连接键对结果DataFrame进行排序。 默认为True,设置为False将在很多情况下显着提高性能。
suffixes: 用于重叠列的字符串后缀元组。 默认为('_x', '_y')。
copy: 始终从传递的DataFrame对象复制数据(默认为True),即使不需要重建索引也是如此。
indicator:将一列添加到名为_merge的输出DataFrame,其中包含有关每行源的信息。 _merge是分类类型,对于其合并键仅出现在“左”DataFrame中的观察值,取值为left_only;对于其合并键仅出现在“右”DataFrame中的观察值为right_only;如果在两者中都找到该行的合并键,则为both。
# merge demo: default inner join on the shared 'key' column vs. outer join.
import pandas as pd

df1 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'b'], 'value1': range(5)})
df2 = pd.DataFrame({'key': ['a', 'c', 'c', 'c', 'c'], 'value2': range(5)})
display(df1, df2, pd.merge(df1, df2))

mein = pd.merge(df1, df2, on='key', how='inner')  # intersection of keys
meou = pd.merge(df1, df2, how='outer')            # union of keys, NaN-filled
display(mein, meou)
key value1 value2 0 a 0.0 0.0 1 a 2.0 0.0 2 b 1.0 NaN 3 b 3.0 NaN 4 b 4.0 NaN 5 c NaN 1.0 6 c NaN 2.0 7 c NaN 3.0 8 c NaN 4.0
# Left join: keep every row of df1; keys absent from df2 get NaN in value2.
pd. merge( df1, df2, how= 'left' )
key value1 value2 0 a 0 0.0 1 b 1 NaN 2 a 2 0.0 3 b 3 NaN 4 b 4 NaN
# Right join: keep every row of df2; keys absent from df1 get NaN in value1.
pd. merge( df1, df2, how= 'right' )
key value1 value2 0 a 0.0 0 1 a 2.0 0 2 c NaN 1 3 c NaN 2 4 c NaN 3 5 c NaN 4
# Merge on differently named key columns via left_on/right_on;
# both key columns are kept in the result.
df3 = pd.DataFrame({'lkey': ['a', 'b', 'a', 'b', 'b'], 'data1': range(5)})
df4 = pd.DataFrame({'rkey': ['a', 'c', 'c', 'c', 'c'], 'data2': range(5)})
df5 = pd.merge(df3, df4, left_on='lkey', right_on='rkey')
display(df3, df4, df5)
lkey data1 rkey data2 0 a 0 a 0 1 a 2 a 0
# The same left_on/right_on merge under outer, left and right joins.
df6 = pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='outer')
df7 = pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='left')
df8 = pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='right')
display(df6, df7, df8)
lkey data1 rkey data2 0 a 0.0 a 0.0 1 a 2.0 a 0.0 2 b 1.0 NaN NaN 3 b 3.0 NaN NaN 4 b 4.0 NaN NaN 5 NaN NaN c 1.0 6 NaN NaN c 2.0 7 NaN NaN c 3.0 8 NaN NaN c 4.0
lkey data1 rkey data2 0 a 0 a 0.0 1 b 1 NaN NaN 2 a 2 a 0.0 3 b 3 NaN NaN 4 b 4 NaN NaN
lkey data1 rkey data2 0 a 0.0 a 0 1 a 2.0 a 0 2 NaN NaN c 1 3 NaN NaN c 2 4 NaN NaN c 3 5 NaN NaN c 4
# Two index-aligned frames intended for an index-on-index merge demo.
df9= pd. DataFrame( np. arange( 12 ) . reshape( 3 , 4 ) , index= list ( 'abc' ) , columns= [ 'v1' , 'v2' , 'v3' , 'v4' ] )
df10= pd. DataFrame( np. arange( 12 , 24 , 1 ) . reshape( 3 , 4 ) , index= list ( 'abd' ) , columns= [ 'v5' , 'v6' , 'v7' , 'v8' ] )
# NOTE(review): this merges df5/df6 from the earlier left_on/right_on example,
# not the df9/df10 built just above. The pasted output below matches df5/df6,
# so this is what was actually run — but the surrounding prose suggests
# df9/df10 may have been intended; confirm before reusing this cell.
df11= pd. merge( df5, df6, left_index= True , right_index= True )
display( df9, df10, df11)
v1 v2 v3 v4 a 0 1 2 3 b 4 5 6 7 c 8 9 10 11
v5 v6 v7 v8 a 12 13 14 15 b 16 17 18 19 d 20 21 22 23
lkey_x data1_x rkey_x data2_x lkey_y data1_y rkey_y data2_y 0 a 0 a 0 a 0.0 a 0.0 1 a 2 a 0 a 2.0 a 0.0
2.join方法
# join vs merge: these two calls express the same inner join on key1/key2
# (illustrative only — `left`/`right` are not defined in this file).
result = left. join( right, on= [ 'key1' , 'key2' ] , how= 'inner' )
result= pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'inner' )
3.append方法
# Reference: signature of DataFrame.append (deprecated in pandas 1.4,
# removed in 2.0 — use pd.concat in new code). Illustrative, not executable.
append( other, ignore_index= False , verify_integrity= False , sort= None )
# append stacks rows; columns present on only one side are filled with NaN.
d = {'gene': {'a': 'gene1', 'b': 'gene2', 'c': 'gene3', 'd': 'gene4'},
     'expression': {'a': '0', 'b': '3', 'c': '4', 'd': '9'}}
d1 = {'gene': {'a': 'gene4', 'b': 'gene5', 'c': 'gene6', 'd': 'gene7'},
      'expression': {'a': '9', 'b': '2', 'c': '3', 'd': '3'},
      'description': {'a': 'transposon element', 'b': 'nuclear genes',
                      'c': 'retrotransposon', 'd': 'unknown'}}
df = pd.DataFrame(d)
df1 = pd.DataFrame(d1)
df2 = df.append(df1, sort=False)
display(df, df1, df2)
gene expression a gene1 0 b gene2 3 c gene3 4 d gene4 9
gene expression description a gene4 9 transposon element b gene5 2 nuclear genes c gene6 3 retrotransposon d gene7 3 unknown
gene expression description a gene1 0 NaN b gene2 3 NaN c gene3 4 NaN d gene4 9 NaN a gene4 9 transposon element b gene5 2 nuclear genes c gene6 3 retrotransposon d gene7 3 unknown
# sort=True orders columns alphabetically; ignore_index=True renumbers rows 0..n.
df2 = df.append(df1, sort=True)
df3 = df.append(df1, sort=False, ignore_index=True)
display(df2, df3)
description expression gene a NaN 0 gene1 b NaN 3 gene2 c NaN 4 gene3 d NaN 9 gene4 a transposon element 9 gene4 b nuclear genes 2 gene5 c retrotransposon 3 gene6 d unknown 3 gene7
gene expression description 0 gene1 0 NaN 1 gene2 3 NaN 2 gene3 4 NaN 3 gene4 9 NaN 4 gene4 9 transposon element 5 gene5 2 nuclear genes 6 gene6 3 retrotransposon 7 gene7 3 unknown
2.5 换一种角度看数据
2.5.1 任务一:将我们的数据变为Series类型的数据
# Reload the merged table and flatten it: stack() pivots the column labels
# into an inner row-index level, turning each row into a group of
# (row, column) -> value entries, i.e. a Series.
text = pd. read_csv( 'result.csv' )
text. head( )
unit_result= text. stack( ) . head( 20 )
unit_result. head( )
0 Unnamed: 0 0
PassengerId 1
Survived 0
Pclass 3
Name Braund, Mr. Owen Harris
dtype: object
python pandas stack和unstack函数
https://www.cnblogs.com/bambipai/p/7658311.html
表格在行列方向上均有索引(类似于DataFrame),花括号结构只有“列方向”上的索引(类似于层次化的Series),结构更加偏向于堆叠(Series-stack,方便记忆)。stack函数会将数据从”表格结构“变成”花括号结构“,即将其行索引变成列索引,反之,unstack函数将数据从”花括号结构“变成”表格结构“,即要将其中一层的列索引变成行索引。
# Persist the stacked Series and read it back to verify the CSV round-trip.
unit_result. to_csv( 'unit_result.csv' )
test = pd. read_csv( 'unit_result.csv' )
test. head( )