Pandas是基于Numpy创建的Python库,为Python提供易于使用的数据结构和数据分析工具。
使用以下语句导入Pandas库:
import pandas as pd
数据结构
- Series序列
存储任意类型数据的一维数组
# Build a labelled one-dimensional Series. These values are the ones that all
# later outputs shown for `s` in these notes are computed from.
s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
print(s)
# a 3
# b -5
# c 7
# d 4
# dtype: int64
# Look up a single value by its index label
print(s["b"])
# -5
- DataFrame数据框
data = {"Country": ['Belgium', 'India', 'Brazil'],
        "Capital": ['Brussels', 'New Delhi', 'Brasília'],
        'Population': [11190846, 1303171035, 207847528]}
# Each name in `columns` must match a dict key exactly; a name with no
# matching key produces a column filled with NaN instead of the data.
df = pd.DataFrame(data, columns=["Country", "Capital", "Population"])
# pd.DataFrame(data) gives the same result here: the columns default to the
# dict keys "Country", "Capital", "Population"
print(df)
# Country Capital Population
# 0 Belgium Brussels 11190846
# 1 India New Delhi 1303171035
# 2 Brazil Brasília 207847528
# Take a subset of the rows with a slice
print(df[1:])
# Country Capital Population
# 1 India New Delhi 1303171035
# 2 Brazil Brasília 207847528
# print(df[1]) raises an error: a bare integer is looked up as a column label
- 选取、布尔索引以及设置值
# By position: select a value by integer row and column position
print(df.iloc[0,0])
# Belgium
# List selectors keep the row/column labels in the result (see output below)
print(df.iloc[[0],[0]])
# Country
# 0 Belgium
print(df.iat[0,0])
# Belgium
# By label: select a value by row label and column name
print(df.loc[[0],["Country"]])
# Country
# 0 Belgium
print(df.loc[0,"Country"])
# Belgium
# Boolean indexing
# Values of Series s that are NOT greater than 1 (~ inverts the boolean mask)
print(s[~(s>1)]) # for this NaN-free series, same as print(s[s<=1])
# b -5
# dtype: int64
# Values of Series s that are less than -1 or greater than 2
print(s[(s < -1) | (s > 2)])
# a 3
# b -5
# c 7
# d 4
# dtype: int64
# Filter the DataFrame with a boolean mask
# (bare expression: the result is only displayed in an interactive session)
df[df['Population']>1200000000]
# Country Capital Population
# 1 India New Delhi 1303171035
# Setting values
s['a']=6 # set the value at index label a of Series s to 6
print(s)
# a 6
# b -5
# c 7
# d 4
# dtype: int64
- 删除数据
# The results of drop() are only printed here; they are not assigned back to s or df.
print(s.drop(['a','c'])) # drop Series values by index label (axis=0)
# b -5
# d 4
# dtype: int64
print(df.drop("Country",axis=1)) # drop a DataFrame column by column name (axis=1)
# Capital Population
# 0 Brussels 11190846
# 1 New Delhi 1303171035
# 2 Brasília 207847528
- 调用帮助
# help() prints the documentation itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
help(pd.Series.loc)
- 排序和排名
print(df.sort_index()) # sort by index labels
# Country Capital Population
# 0 Belgium Brussels 11190846
# 1 India New Delhi 1303171035
# 2 Brazil Brasília 207847528
print(df.sort_values(by="Country")) # sort by the values of one column
# Country Capital Population
# 0 Belgium Brussels 11190846
# 2 Brazil Brasília 207847528
# 1 India New Delhi 1303171035
print(df.rank()) # rank the values within each column
# Country Capital Population
# 0 1.0 2.0 1.0
# 1 3.0 3.0 3.0
# 2 2.0 1.0 2.0
- 查询序列与数据框的信息
(a) 基本信息
# (rows, columns)
print(df.shape)
# (3, 3)
# Row index
print(df.index)
# RangeIndex(start=0, stop=3, step=1)
# Column names
print(df.columns)
# Index(['Country', 'Capital', 'Population'], dtype='object')
# Basic information about the DataFrame
# (info() prints its report itself and returns None, hence the trailing "None" below)
print(df.info())
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 3 entries, 0 to 2
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Country 3 non-null object
# 1 Capital 3 non-null object
# 2 Population 3 non-null int64
# dtypes: int64(1), object(2)
# memory usage: 200.0+ bytes
# None
# Number of non-NA values per column
df.count()
# Country 3
# Capital 3
# Population 3
# dtype: int64
(b) 汇总
# Sum of each column (string columns are concatenated, as the output shows)
print(df.sum())
# Country BelgiumIndiaBrazil
# Capital BrusselsNew DelhiBrasília
# Population 1522209409
# dtype: object
# Cumulative sum down each column
print(df.cumsum())
# Country Capital Population
# 0 Belgium Brussels 11190846
# 1 BelgiumIndia BrusselsNew Delhi 1314361881
# 2 BelgiumIndiaBrazil BrusselsNew DelhiBrasília 1522209409
# Minimum value divided by maximum value
print(s.min()/s.max())
# -0.7142857142857143
data = {"gini":[0.4, 0.5, 0.6],
'Population': [11190846, 1303171035, 207847528]}
df2 = pd.DataFrame(data)
print(df2.min()/df2.max())
# gini 0.666667
# Population 0.008587
# dtype: float64
# Index label of the minimum divided by index label of the maximum, per column
df2.idxmin()/df2.idxmax()
# gini 0.0
# Population 0.0
# dtype: float64
# Summary statistics
df2.describe()
# gini Population
# count 3.00 3.000000e+00
# mean 0.50 5.074031e+08
# std 0.10 6.961346e+08
# min 0.40 1.119085e+07
# 25% 0.45 1.095192e+08
# 50% 0.50 2.078475e+08
# 75% 0.55 7.555093e+08
# max 0.60 1.303171e+09
# Mean
print(df2.mean())
# gini 5.000000e-01
# Population 5.074031e+08
# dtype: float64
# Median
print(df2.median())
# gini 0.5
# Population 207847528.0
# dtype: float64
- 应用函数
# Define an anonymous function with lambda: doubles its argument
f = lambda x:x*2
# Apply the function to the DataFrame
print(df2.apply(f))
# gini Population
# 0 0.8 22381692
# 1 1.0 2606342070
# 2 1.2 415695056
# Apply the function to every cell
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map — confirm the target pandas version before switching
df2.applymap(f)
# gini Population
# 0 0.8 22381692
# 1 1.0 2606342070
# 2 1.2 415695056
- 数据对齐
(1)内部数据对齐
如有不一致的索引,则使用NA值:
print(s)
# a 6
# b -5
# c 7
# d 4
# dtype: int64
# s3 has no entry for index label b
s3=pd.Series([7,-2,3], index=["a","c","d"])
# Values are added label by label; b is missing from s3, so its result is NaN
print(s+s3)
# a 13.0
# b NaN
# c 5.0
# d 7.0
# dtype: float64
(2)使用Fill方法运算
还可以使用带 fill_value 参数的运算方法(如 add、sub、div)进行对齐运算,用指定的值代替缺失的索引项
# fill_value stands in for the entry that is missing on one side (here label b
# of s3) before the operation is applied
print(s.add(s3,fill_value=0))
# a 13.0
# b -5.0
# c 5.0
# d 7.0
# dtype: float64
print(s.sub(s3,fill_value=2))
# a -1.0
# b -7.0
# c 9.0
# d 1.0
# dtype: float64
print(s.div(s3,fill_value=4))
# a 0.857143
# b -1.250000
# c -3.500000
# d 1.333333
# dtype: float64
- 输入和输出
(1)读取写入CSV
默认情况下,pd.read_csv() 会将第一行当作列名
# Passing header=None to pd.read_csv() turns off the default of using the
# first row as column names; the header row then shows up as data row 0
print(pd.read_csv(r"D:\code\binary_classify\data\train_sub_labels.csv", header=None, nrows=5))
# 0 1
# 0 id label
# 1 f38a6374c348f90b587e046aac6079959adf3835 0
# 2 c18f2d887b7ae4f6742ee445113fa1aef383ed77 1
# 3 755db6279dae599ebb4d39a9123cce439965282d 0
# 4 bc3f0c64fb968ff4a8bd33af6971ecae77c75e08 0
# print(df.to_csv("myDataFrame.csv")
df = pd.read_csv(r"D:\code\binary_classify\data\train_sub_labels.csv", nrows=4) # read a CSV file (first 4 data rows)
print(df)
# id label
# 0 f38a6374c348f90b587e046aac6079959adf3835 0
# 1 c18f2d887b7ae4f6742ee445113fa1aef383ed77 1
# 2 755db6279dae599ebb4d39a9123cce439965282d 0
# 3 bc3f0c64fb968ff4a8bd33af6971ecae77c75e08 0
df.to_csv("myDataFrame.csv") # write to a CSV file
(2)读取写入Excel
# Read an Excel workbook into a DataFrame
df = pd.read_excel(r"D:\code\python_for_microscopists\other_files\K_Means.xlsx")
print(df)
# X Y
# 0 1 42
# 1 2 46
# .. ... ...
# 148 149 61
# [149 rows x 2 columns]
# Write the DataFrame to an Excel file, into the sheet named "Sheet1"
df.to_excel("myDataFrame.xlsx",sheet_name="Sheet1")
# Read from an Excel file that contains several sheets
xlsx=pd.ExcelFile("file.xls")
df=pd.read_excel(xlsx, "Sheet1")