1.链接,concat
1.1行链接
import pandas as pd
df1 = pd.read_csv('./data/concat_1.csv')
print(df1)
df2 = pd.read_csv('./data/concat_2.csv')
print(df2)
df3 = pd.read_csv('./data/concat_3.csv')
print(df3)
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
A B C D
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7
A B C D
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11
row_concat = pd.concat([df1, df2, df3])
print(row_concat)
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11
print(row_concat.iloc[4,])
print(row_concat.iloc[4:6,])
A a4
B b4
C c4
D d4
Name: 0, dtype: object
A B C D
0 a4 b4 c4 d4
1 a5 b5 c5 d5
new_row_series = pd.Series(['n1','n2','n3','n4'])
print(new_row_series)
0 n1
1 n2
2 n3
3 n4
dtype: object
print(pd.concat([df1, new_row_series]))
A B C D 0
0 a0 b0 c0 d0 NaN
1 a1 b1 c1 d1 NaN
2 a2 b2 c2 d2 NaN
3 a3 b3 c3 d3 NaN
0 NaN NaN NaN NaN n1
1 NaN NaN NaN NaN n2
2 NaN NaN NaN NaN n3
3 NaN NaN NaN NaN n4
new_row_df = pd.DataFrame([['n1','n2','n3','n4']],
columns=['A','B','C','D'])
print(new_row_df)
A B C D
0 n1 n2 n3 n4
print(pd.concat([df1,new_row_df]))
print(pd.concat([df1,new_row_df],ignore_index=True))
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
0 n1 n2 n3 n4
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
4 n1 n2 n3 n4
1.2列拼接
df1 = pd.read_csv('./data/concat_1.csv')
print(df1)
df2 = pd.read_csv('./data/concat_2.csv')
print(df2)
df3 = pd.read_csv('./data/concat_3.csv')
print(df3)
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
A B C D
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7
A B C D
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11
col_concat = pd.concat([df1, df2, df3],axis=1)
print(col_concat)
A B C D A B C D A B C D
0 a0 b0 c0 d0 a4 b4 c4 d4 a8 b8 c8 d8
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10
3 a3 b3 c3 d3 a7 b7 c7 d7 a11 b11 c11 d11
print(col_concat['B'])
B B B
0 b0 b4 b8
1 b1 b5 b9
2 b2 b6 b10
3 b3 b7 b11
# ADD NEW COL
col_concat['new_col_list'] = ['n1','n2','n3','n4']
print(col_concat)
A B C D A B C D A B C D new_col_list
0 a0 b0 c0 d0 a4 b4 c4 d4 a8 b8 c8 d8 n1
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9 n2
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10 n3
3 a3 b3 c3 d3 a7 b7 c7 d7 a11 b11 c11 d11 n4
# 用Series添加一列
col_concat['new_col_series'] = pd.Series(['n1','n2','n3','n4'])
print(col_concat)
A B C D A B C D A B C D new_col_list \
0 a0 b0 c0 d0 a4 b4 c4 d4 a8 b8 c8 d8 n1
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9 n2
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10 n3
3 a3 b3 c3 d3 a7 b7 c7 d7 a11 b11 c11 d11 n4
new_col_series
0 n1
1 n2
2 n3
3 n4
col_concat = pd.concat([df1,df2,df3],axis=1,ignore_index=True)
print(col_concat)
0 1 2 3 4 5 6 7 8 9 10 11
0 a0 b0 c0 d0 a4 b4 c4 d4 a8 b8 c8 d8
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10
3 a3 b3 c3 d3 a7 b7 c7 d7 a11 b11 c11 d11
print(col_concat[2])
0 c0
1 c1
2 c2
3 c3
Name: 2, dtype: object
print(col_concat[2][1])
c1
print(col_concat[1:3])
0 1 2 3 4 5 6 7 8 9 10 11
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10
# 拥有不同列的data_frame
df1 = pd.read_csv('./data/concat_1.csv')
df2 = pd.read_csv('./data/concat_2.csv')
df3 = pd.read_csv('./data/concat_3.csv')
df1.columns = ['A','B','C','D']
df2.columns = ['E','F','G','H']
df3.columns = ['A','C','F','H']
print(df1)
print(df2)
print(df3)
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
E F G H
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7
A C F H
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11
col_concat = pd.concat([df1,df2,df3])
print(col_concat)
A B C D E F G H
0 a0 b0 c0 d0 NaN NaN NaN NaN
1 a1 b1 c1 d1 NaN NaN NaN NaN
2 a2 b2 c2 d2 NaN NaN NaN NaN
3 a3 b3 c3 d3 NaN NaN NaN NaN
0 NaN NaN NaN NaN a4 b4 c4 d4
1 NaN NaN NaN NaN a5 b5 c5 d5
2 NaN NaN NaN NaN a6 b6 c6 d6
3 NaN NaN NaN NaN a7 b7 c7 d7
0 a8 NaN b8 NaN NaN c8 NaN d8
1 a9 NaN b9 NaN NaN c9 NaN d9
2 a10 NaN b10 NaN NaN c10 NaN d10
3 a11 NaN b11 NaN NaN c11 NaN d11
/home/leon/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
"""Entry point for launching an IPython kernel.
2合并,merge
关系型数据库中的两个表之间的链接
# table1.field1 table.field2
# select table1.field1,table2.field2
# where table1.field1 = table2.field
# 1. one vs one
# 2. more vs one
# 3. more vs more
import pandas as pd
person = pd.read_csv('./data/survey_person.csv')
site = pd.read_csv('./data/survey_site.csv')
survey = pd.read_csv('./data/survey_survey.csv')
visited = pd.read_csv('./data/survey_visited.csv')
print(person)
print(site)
print(survey)
print(visited)
ident personal family
0 dyer William Dyer
1 pb Frank Pabodie
2 lake Anderson Lake
3 roe Valentina Roerich
4 danforth Frank Danforth
name lat long
0 DR-1 -49.85 -128.57
1 DR-3 -47.15 -126.72
2 MSK-4 -48.87 -123.40
taken person quant reading
0 619 dyer rad 9.82
1 619 dyer sal 0.13
2 622 dyer rad 7.80
3 622 dyer sal 0.09
4 734 pb rad 8.41
5 734 lake sal 0.05
6 734 pb temp -21.50
7 735 pb rad 7.22
8 735 NaN sal 0.06
9 735 NaN temp -26.00
10 751 pb rad 4.35
11 751 pb temp -18.50
12 751 lake sal 0.10
13 752 lake rad 2.19
14 752 lake sal 0.09
15 752 lake temp -16.00
16 752 roe sal 41.60
17 837 lake rad 1.46
18 837 lake sal 0.21
19 837 roe sal 22.50
20 844 roe rad 11.25
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
visited_subset = visited.loc[[0,2],]
print(visited_subset)
ident site dated
0 619 DR-1 1927-02-08
2 734 DR-3 1939-01-07
# one to one
print(site)
print(visited_subset)
name lat long
0 DR-1 -49.85 -128.57
1 DR-3 -47.15 -126.72
2 MSK-4 -48.87 -123.40
ident site dated
0 619 DR-1 1927-02-08
2 734 DR-3 1939-01-07
merge1 = site.merge(visited_subset,left_on = 'name',right_on='site')
print(merge1)
name lat long ident site dated
0 DR-1 -49.85 -128.57 619 DR-1 1927-02-08
1 DR-3 -47.15 -126.72 734 DR-3 1939-01-07
merge多对一
print(site)
print(visited)
name lat long
0 DR-1 -49.85 -128.57
1 DR-3 -47.15 -126.72
2 MSK-4 -48.87 -123.40
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
merge2 = site.merge(visited,left_on = 'name',right_on='site')
print(merge2)
name lat long ident site dated
0 DR-1 -49.85 -128.57 619 DR-1 1927-02-08
1 DR-1 -49.85 -128.57 622 DR-1 1927-02-10
2 DR-1 -49.85 -128.57 844 DR-1 1932-03-22
3 DR-3 -47.15 -126.72 734 DR-3 1939-01-07
4 DR-3 -47.15 -126.72 735 DR-3 1930-01-12
5 DR-3 -47.15 -126.72 751 DR-3 1930-02-26
6 DR-3 -47.15 -126.72 752 DR-3 NaN
7 MSK-4 -48.87 -123.40 837 MSK-4 1932-01-14
merge多对多
print(person)
print(survey)
ident personal family
0 dyer William Dyer
1 pb Frank Pabodie
2 lake Anderson Lake
3 roe Valentina Roerich
4 danforth Frank Danforth
taken person quant reading
0 619 dyer rad 9.82
1 619 dyer sal 0.13
2 622 dyer rad 7.80
3 622 dyer sal 0.09
4 734 pb rad 8.41
5 734 lake sal 0.05
6 734 pb temp -21.50
7 735 pb rad 7.22
8 735 NaN sal 0.06
9 735 NaN temp -26.00
10 751 pb rad 4.35
11 751 pb temp -18.50
12 751 lake sal 0.10
13 752 lake rad 2.19
14 752 lake sal 0.09
15 752 lake temp -16.00
16 752 roe sal 41.60
17 837 lake rad 1.46
18 837 lake sal 0.21
19 837 roe sal 22.50
20 844 roe rad 11.25
merge3 = person.merge(survey,left_on = 'ident',right_on='person')
print(merge3)
ident personal family taken person quant reading
0 dyer William Dyer 619 dyer rad 9.82
1 dyer William Dyer 619 dyer sal 0.13
2 dyer William Dyer 622 dyer rad 7.80
3 dyer William Dyer 622 dyer sal 0.09
4 pb Frank Pabodie 734 pb rad 8.41
5 pb Frank Pabodie 734 pb temp -21.50
6 pb Frank Pabodie 735 pb rad 7.22
7 pb Frank Pabodie 751 pb rad 4.35
8 pb Frank Pabodie 751 pb temp -18.50
9 lake Anderson Lake 734 lake sal 0.05
10 lake Anderson Lake 751 lake sal 0.10
11 lake Anderson Lake 752 lake rad 2.19
12 lake Anderson Lake 752 lake sal 0.09
13 lake Anderson Lake 752 lake temp -16.00
14 lake Anderson Lake 837 lake rad 1.46
15 lake Anderson Lake 837 lake sal 0.21
16 roe Valentina Roerich 752 roe sal 41.60
17 roe Valentina Roerich 837 roe sal 22.50
18 roe Valentina Roerich 844 roe rad 11.25
3.删除drop
import pandas as pd
df = pd.read_csv('./data/survey_visited.csv')
print(df)
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
2.1删除行
删除单行
df2=df.drop(labels=0) # axis默认等于0,即按行删除,这里表示按行删除第0行
print(df2)
ident site dated
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
删除多行
df3=df.drop(labels=[1,3],axis=0) # axis=0 表示按行删除,删除第1行和第3行
print(df)
print(df3)
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
ident site dated
0 619 DR-1 1927-02-08
2 734 DR-3 1939-01-07
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
要删除连续的多行可以用range(),删除连续的多列不能用此方法
df4=df.drop(labels=range(1,3),axis=0) # axis=0 表示按行删除,删除索引值是第1行至第3行的正行数据
print(df4)
ident site dated
0 619 DR-1 1927-02-08
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
2.2删除列
删除单列
df5=df.drop(labels='site',axis=1) # axis=1 表示按列删除,删除gender列
print(df)
print(df5)
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
ident dated
0 619 1927-02-08
1 622 1927-02-10
2 734 1939-01-07
3 735 1930-01-12
4 751 1930-02-26
5 752 NaN
6 837 1932-01-14
7 844 1932-03-22
删除指定的某几列
df6=df.drop(labels=['ident',"site"],axis=1) # axis=1 表示按列删除,删除gender、age列
print(df)
print(df6)
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
dated
0 1927-02-08
1 1927-02-10
2 1939-01-07
3 1930-01-12
4 1930-02-26
5 NaN
6 1932-01-14
7 1932-03-22