目录
6.Pandas导入导出
6.1导入数据
import pandas as pd
# CSV文件不可以直接改后缀名,需要另存是保存为CSV格式
data = pd.read_csv('student.csv')
print("~~~~~~~~~~~~~~~~~~读取整个表格~~~~~~~~~~~~~~~~~~")
print(data)
# Student ID name age gender
# 0 1100 Kelly 22 Famle
# 1 1101 Clo 21 Famle
# 2 1102 Tilly 22 Famle
# 3 1103 Tony 24 Male
# 4 1104 David 20 Male
# 5 1105 Catty 22 Famle
# 6 1106 M 3 Famle
# 7 1107 N 43 Male
# 8 1108 A 13 Male
# 9 1109 S 12 Male
# 10 1110 David 33 Male
# 11 1111 Dw 3 Famle
# 12 1112 Q 23 Male
# 13 1113 W 11 Famle
print("~~~~~~~~~~~~~~~~~~前三行~~~~~~~~~~~~~~~~~~")
# 前三行
print(data.head(3))
print("~~~~~~~~~~~~~~~~~~后三行~~~~~~~~~~~~~~~~~~")
print(data.tail(3))
6.2导出数据
print("~~~~~~~~~~~~~~~~~~将资料存取成pickle~~~~~~~~~~~~~~~~~~")
data.to_pickle('student.pickle')
# 读取pickle文件并打印
print(pd.read_pickle('student.pickle'))
# Student ID name age gender
# 0 1100 Kelly 22 Famle
# 1 1101 Clo 21 Famle
# 2 1102 Tilly 22 Famle
# 3 1103 Tony 24 Male
# 4 1104 David 20 Male
# 5 1105 Catty 22 Famle
# 6 1106 M 3 Famle
# 7 1107 N 43 Male
# 8 1108 A 13 Male
# 9 1109 S 12 Male
# 10 1110 David 33 Male
# 11 1111 Dw 3 Famle
# 12 1112 Q 23 Male
# 13 1113 W 11 Famle
7.Pandas合并操作
7.1 Pandas 合并 concat
pdf1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
pdf2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
pdf3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
print("~~~~~~~~~~~~~~~~~~打印pdf1~~~~~~~~~~~~~~~~~~")
print(pdf1)
# ~~~~~~~~~~~~~~~~~~打印pdf1~~~~~~~~~~~~~~~~~~
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
print("~~~~~~~~~~~~~~~~~~打印pdf2~~~~~~~~~~~~~~~~~~")
print(pdf2)
# ~~~~~~~~~~~~~~~~~~打印pdf2~~~~~~~~~~~~~~~~~~
# a b c d
# 0 1.0 1.0 1.0 1.0
# 1 1.0 1.0 1.0 1.0
# 2 1.0 1.0 1.0 1.0
print("~~~~~~~~~~~~~~~~~~打印pdf3~~~~~~~~~~~~~~~~~~")
print(pdf3)
# ~~~~~~~~~~~~~~~~~~打印pdf3~~~~~~~~~~~~~~~~~~
# a b c d
# 0 2.0 2.0 2.0 2.0
# 1 2.0 2.0 2.0 2.0
# 2 2.0 2.0 2.0 2.0
# concat合并
# 将axis=0,为纵向合并,累加行
# 但是他的index行索引会重复
print("~~~~~~~~~~~~~~~~~~打印res(三个矩阵以行合并)~~~~~~~~~~~~~~~~~~")
res = pd.concat([pdf1, pdf2, pdf3], axis=0)
print(res)
# ~~~~~~~~~~~~~~~~~~打印res(三个矩阵以行合并)~~~~~~~~~~~~~~~~~~
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 0 1.0 1.0 1.0 1.0
# 1 1.0 1.0 1.0 1.0
# 2 1.0 1.0 1.0 1.0
# 0 2.0 2.0 2.0 2.0
# 1 2.0 2.0 2.0 2.0
# 2 2.0 2.0 2.0 2.0
# index不重复的方法
res = pd.concat([pdf1, pdf2, pdf3], axis=0, ignore_index=True)
print("~~~~~~~~~~~~~~~~~~打印res(三个矩阵以行合并后index不重复)~~~~~~~~~~~~~~~~~~")
print(res)
# ~~~~~~~~~~~~~~~~~~打印res(三个矩阵以行合并后index不重复)~~~~~~~~~~~~~~~~~~
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
# 6 2.0 2.0 2.0 2.0
# 7 2.0 2.0 2.0 2.0
# 8 2.0 2.0 2.0 2.0
pd_1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3, ])
# a b c d
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 0.0 0.0 0.0 0.0
pd_2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# b c d e
# 2 1.0 1.0 1.0 1.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
'''
join='outer',函数默认为join='outer'。此方法是依照column来做纵向合并,有相同的column上下合并在一起,
其他独自的column各自成列,原来没有值的位置皆为NaN填充。
'''
res = pd.concat([pd_1, pd_2], axis=0, join='outer')
print('join合并后index并没有修改')
print(res)
# join合并后index并没有修改
# a b c d e
# 1 0.0 0.0 0.0 0.0 NaN
# 2 0.0 0.0 0.0 0.0 NaN
# 3 0.0 0.0 0.0 0.0 NaN
# 2 NaN 1.0 1.0 1.0 1.0
# 3 NaN 1.0 1.0 1.0 1.0
# 4 NaN 1.0 1.0 1.0 1.0
print('添加ignore_index后修改index')
# 通过将ignore_index选项设置为True清除现有索引并在结果中将其重置
res = pd.concat([pd_1, pd_2], axis=0, join='outer',ignore_index=True)
print(res)
# 添加ignore_index后修改index
# a b c d e
# 0 0.0 0.0 0.0 0.0 NaN
# 1 0.0 0.0 0.0 0.0 NaN
# 2 0.0 0.0 0.0 0.0 NaN
# 3 NaN 1.0 1.0 1.0 1.0
# 4 NaN 1.0 1.0 1.0 1.0
# 5 NaN 1.0 1.0 1.0 1.0
# join = 'inner'合并相同的列字段
# 纵向“内”合并pd_1和pd_2
res = pd.concat([pd_1, pd_2], axis=0, join='inner')
print('合并pd_1和pd_2相同的列头:')
print(res)
# b c d
# 1 0.0 0.0 0.0
# 2 0.0 0.0 0.0
# 3 0.0 0.0 0.0
# 2 1.0 1.0 1.0
# 3 1.0 1.0 1.0
# 4 1.0 1.0 1.0
pd_1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1,2,3])
pd_2 = pd.DataFrame(np.ones((3, 4))*0, columns=['b', 'c', 'd', 'e'], index=[2,3,4])
print('依照pd_1.index进行横向合并')
# axis - 要连接的轴
# join – 如何处理其他轴上的索引
# join_axes – 用于其他 n - 1 轴的特定索引,而不是执行内部/外部集合逻辑
# ignore_index – 如果为 True,则不使用沿串联轴的索引值。
# 结果轴将被标记为 0, ..., n - 1。
# 如果您在连接轴没有有意义的索引信息的情况下连接对象,这将非常有用。
# 请注意其他轴上的索引值在连接中仍然有效。
# keys - 如果通过了多个级别,则应包含元组。 使用传递的键作为最外层构建分层索引
# levels – 用于构建 MultiIndex 的特定级别(唯一值)。 否则他们将从密钥中推断出来
# names - 生成的分层索引中级别的名称
# verify_integrity – 检查新的连接轴是否包含重复项。 相对于实际的数据连接,这可能非常昂贵
# sort – 如果在“join”为“outer”时尚未对齐,则对非串联轴进行排序。
# 当前默认的排序已被弃用,并将在未来版本的 Pandas 中更改为不排序。
# 显式传递 ``sort=True`` 以消除警告和排序。
# 显式传递 ``sort=False`` 以消除警告而不是排序。
# 这在 ``join='inner'`` 时不起作用,它已经保留了非串联轴的顺序。 .. 版本已添加:: 0.23.0
# copy – 如果为 False,则不要不必要地复制数据
res = pd.concat([pd_1, pd_2], axis=1, join_axes=[pd_1.index])
print(res)
# 依照pd_1.index进行横向合并
# a b c d b c d e
# 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
# 2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
# 3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
# append(添加数据)
# append只有纵向合并,没有横向合并
# 定义资料集
pd_1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
pd_2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
pd_3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3 ,4], index = ['a', 'b', 'c', 'd'])
# 将pd_2合并到pd_1下面,以及重置index,并打印结果
res = pd_1.append(pd_2, ignore_index=True)
print('将pd_2合并到pd_1下面,以及重置index:')
print(res)
# 将pd_2合并到pd_1下面,以及重置index:
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
res = pd_1.append([pd_2, pd_3], ignore_index=True)
print('将pd_2和pd_3合并到pd_1下面,以及重置index:')
print(res)
# 将pd_2和pd_3合并到pd_1下面,以及重置index:
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
# 6 2.0 2.0 2.0 2.0
# 7 2.0 2.0 2.0 2.0
# 8 2.0 2.0 2.0 2.0
print('两种不同的合并矩阵的方式:')
res = pd.concat([pd_1, pd_2, pd_3], axis=0, ignore_index=True)
res_1 = pd_1.append([pd_2, pd_3], ignore_index=True)
print('conact方法:')
print(res)
# conact方法:
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
# 6 2.0 2.0 2.0 2.0
# 7 2.0 2.0 2.0 2.0
# 8 2.0 2.0 2.0 2.0
print('append方法:')
print(res_1)
# append方法:
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
# 6 2.0 2.0 2.0 2.0
# 7 2.0 2.0 2.0 2.0
# 8 2.0 2.0 2.0 2.0
7.2Pandas 合并 merge
7.2.1定义资料集并打印出
left = pd.DataFrame({'key' : ['K0', 'K1', 'K2', 'K3'],
'A' : ['A0', 'A1', 'A2', 'A3'],
'B' : ['B0', 'B1', 'B2', 'B3']
})
print(left)
# key A B
# 0 K0 A0 B0
# 1 K1 A1 B1
# 2 K2 A2 B2
# 3 K3 A3 B3
right = pd.DataFrame({'key' : ['K0', 'K1', 'K2', 'K3'],
'C' : ['C0', 'C1', 'C2', 'C3'],
'D' : ['D0', 'D1', 'D2', 'D3']
})
print(right)
# key C D
# 0 K0 C0 D0
# 1 K1 C1 D1
# 2 K2 C2 D2
# 3 K3 C3 D3
7.2.2两列合并
# 依据key1与key2 columns进行合并,
# 并打印出四种结果['left', 'right', 'outer', 'inner']
pd_1 = pd.DataFrame({'lkey' : ['foo', 'bar', 'baz', 'foo'],
'value': [1, 2, 3, 5]})
print(pd_1)
# lkey value
# 0 foo 1
# 1 bar 2
# 2 baz 3
# 3 foo 5
pd_2 = pd.DataFrame({'rkey' : ['foo', 'bar', 'baz', 'foo'],
'value': [5, 6, 7, 8]})
print(pd_2)
# rkey value
# 0 foo 5
# 1 bar 6
# 2 baz 7
# 3 foo 8
# 将右边与左边进行并集操作
# foo(1, 5)\(1, 8)\(5, 5)\(5,8)
# bar(2,6)
# baz(3,7)
pd = pd_1.merge(pd_2, left_on='lkey', right_on='rkey')
print(pd)
# lkey value_x rkey value_y
# 0 foo 1 foo 5
# 1 foo 1 foo 8
# 2 foo 5 foo 5
# 3 foo 5 foo 8
# 4 bar 2 bar 6
# 5 baz 3 baz 7
# 会进行报错
# pd = pd_1.merge(pd_2, left_on='lkey', right_on='rkey', suffixes=(False, False))
# '{rename}'.format(rename=to_rename))
# ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object')
# print(pd)
import pandas as pd
df_1 = pd.DataFrame({'a': ['foo', 'bar'],
'b': [1, 2]})
print(df_1)
# a b
# 0 foo 1
# 1 bar 2
df_2 = pd.DataFrame({'a': ['foo', 'baz'],
'c': [3, 4]})
print(df_2)
# a c
# 0 foo 3
# 1 baz 4
pd = df_1.merge(df_2, how='inner', on='a')
print(pd)
# a b c
# 0 foo 1 3
pd = df_1.merge(df_2, how='left', on='a')
print(pd)
# a b c
# 0 foo 1 3.0
# 1 bar 2 NaN
import pandas as pd
df1 = pd.DataFrame({'left': ['foo', 'bar']})
print(df1)
# left
# 0 foo
# 1 bar
df2 = pd.DataFrame({'right': [7, 8]})
print(df2)
# right
# 0 7
# 1 8
df = df1.merge(df2, how='cross')
print(df)
# left right
# 0 foo 7
# 1 foo 8
# 2 bar 7
# 3 bar 8
7.2.3Indicator设置合并列名称
pdf_1 = pd.DataFrame({'coll' : [0, 1],
'col_left' : ['a', 'b']})
print(pdf_1)
# coll col_left
# 0 0 a
# 1 1 b
pdf_2 = pd.DataFrame({'coll' : [1, 2, 2], 'col_right' : [2, 2, 2]})
print(pdf_2)
# coll col_right
# 0 1 2
# 1 2 2
# 2 2 2
# # 依据col1进行合并,并启用indicator=True,最后打印
# how:使用来自两个帧的键的联合,类似于 SQL 全外连接;按字典顺序对键进行排序。
res = pd.merge(pdf_1, pdf_2, on='coll', how='outer', indicator=True)
print(res)
# coll col_left col_right _merge
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only
# 自定义indicator column的名称,并打印出
res = pd.merge(pdf_1, pdf_2, on='coll', how='outer', indicator='indicator_column')
print(res)
# coll col_left col_right indicator_column
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only
7.2.4 依据 index 合并
# 依据index合并
# 定义资料集并打印出
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left)
# A B
# K0 A0 B0
# K1 A1 B1
# K2 A2 B2
print(right)
# C D
# K0 C0 D0
# K2 C2 D2
# K3 C3 D3
# 依据左右资料集的index进行合并,how='outer',并打印
res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K1 A1 B1 NaN NaN
# K2 A2 B2 C2 D2
# K3 NaN NaN C3 D3
# 依据左右资料集的index进行合并,how='inner',并打印
res = pd.merge(left,right,left_index=True,right_index=True,how='inner')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K2 A2 B2 C2 D2
7.2.5 解决 overlapping 的问题
# 解决overlapping的问题
# 定义资料集
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
# k age
# 0 K0 1
# 1 K1 2
# 2 K2 3
print(girls)
# k age
# 0 K0 4
# 1 K0 5
# 2 K3 6
# 使用suffixes解决overlapping的问题
# 比如将上面两个合并时,age重复了,则可通过suffixes设置,以此保证不重复,不同名
res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner')
print(res)
# k age_boy age_girl
# 0 K0 1 4
# 1 K0 1 5
7.2.6Indicator设置合并列名称
pdf_1 = pd.DataFrame({'coll' : [0, 1],
'col_left' : ['a', 'b']})
print(pdf_1)
# coll col_left
# 0 0 a
# 1 1 b
pdf_2 = pd.DataFrame({'coll' : [1, 2, 2], 'col_right' : [2, 2, 2]})
print(pdf_2)
# coll col_right
# 0 1 2
# 1 2 2
# 2 2 2
# # 依据col1进行合并,并启用indicator=True,最后打印
# how:使用来自两个帧的键的联合,类似于 SQL 全外连接;按字典顺序对键进行排序。
res = pd.merge(pdf_1, pdf_2, on='coll', how='outer', indicator=True)
print(res)
# coll col_left col_right _merge
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only
# 自定义indicator column的名称,并打印出
res = pd.merge(pdf_1, pdf_2, on='coll', how='outer', indicator='indicator_column')
print(res)
# coll col_left col_right indicator_column
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only
7.2.7 依据 index 合并
# 依据index合并
# 定义资料集并打印出
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left)
# A B
# K0 A0 B0
# K1 A1 B1
# K2 A2 B2
print(right)
# C D
# K0 C0 D0
# K2 C2 D2
# K3 C3 D3
# 依据左右资料集的index进行合并,how='outer',并打印
res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K1 A1 B1 NaN NaN
# K2 A2 B2 C2 D2
# K3 NaN NaN C3 D3
# 依据左右资料集的index进行合并,how='inner',并打印
res = pd.merge(left,right,left_index=True,right_index=True,how='inner')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K2 A2 B2 C2 D2
7.2.8 解决 overlapping 的问题
# 解决overlapping的问题
# 定义资料集
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
# k age
# 0 K0 1
# 1 K1 2
# 2 K2 3
print(girls)
# k age
# 0 K0 4
# 1 K0 5
# 2 K3 6
# 使用suffixes解决overlapping的问题
# 比如将上面两个合并时,age重复了,则可通过suffixes设置,以此保证不重复,不同名
res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner')
print(res)
# k age_boy age_girl
# 0 K0 1 4
# 1 K0 1 5
8.Pandas plot 出图
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
print(data)
print(data.cumsum())
# data本来就是一个数据,
data.plot()
plt.show()
# 随机生成1000行4列数据
arr = np.random.randn(1000, 4)
Hang = np.arange(1000)
colunm = list("ABCD")
data = pd.DataFrame(arr, index = Hang, columns=colunm)
data.cumsum()
data.plot()
plt.show()
# scatter
# y 与 x 的散点图,具有不同的标记大小和/或颜色。
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class2', ax=ax)
plt.show()