Pandas基础知识(1)

最新推荐文章于 2023-10-01 16:42:51 发布

求则得之，舍则失之

最新推荐文章于 2023-10-01 16:42:51 发布

阅读量111

点赞数

分类专栏： Pandas 必备知识文章标签： pandas

本文链接：https://blog.csdn.net/weixin_43229348/article/details/119171772

版权

必备知识同时被 2 个专栏收录

32 篇文章 2 订阅

订阅专栏

Pandas

1 篇文章 0 订阅

订阅专栏

Pandas是基于Numpy创建的Python库，为Python提供易于使用的数据结构和数据分析工具。
使用以下语句导入Pandas库：

import pandas as pd

数据结构

Series序列
存储任意类型数据的一维数组

# A Series is a one-dimensional labelled array. These are the values that the
# printed outputs of the later examples in this tutorial are based on.
s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
print(s)
# a    3
# b   -5
# c    7
# d    4
# dtype: int64

# Select a single value by its index label
print(s["b"])
# -5

DataFrame数据框

# Build a DataFrame from a dict of equal-length columns
data = {
    "Country": ['Belgium', 'India', 'Brazil'],
    "Capital": ['Brussels', 'New Delhi', 'Brasília'],
    'Population': [11190846, 1303171035, 207847528],
}
# The names in `columns` must match the dict keys exactly; a misspelled name
# produces a column filled with NaN instead of the intended data.
df = pd.DataFrame(data, columns=["Country", "Capital", "Population"])
# df = pd.DataFrame(data) gives the same result here: without `columns`, the
# dict keys "Country", "Capital", "Population" are used in order.
print(df)
#    Country    Capital  Population
# 0  Belgium   Brussels    11190846
# 1    India  New Delhi  1303171035
# 2   Brazil   Brasília   207847528

# Take a subset of the rows with a slice
print(df[1:])
#    Country    Capital  Population
# 1   India  New Delhi  1303171035
# 2  Brazil   Brasília   207847528
# print(df[1]) would raise an error instead: a bare key is looked up as a
# column name, and there is no column named 1.

选取、布尔索引以及设置值

# By position: pick a value using integer row and column offsets
first_cell = df.iloc[0, 0]
print(first_cell)
# Belgium
first_cell_frame = df.iloc[[0], [0]]  # list indexers give back a DataFrame
print(first_cell_frame)
#     Country
# 0  Belgium
first_cell_fast = df.iat[0, 0]
print(first_cell_fast)
# Belgium

# By label: pick a value using the row label and the column name
labelled_frame = df.loc[[0], ["Country"]]
print(labelled_frame)
#     Country
# 0  Belgium
labelled_cell = df.loc[0, "Country"]
print(labelled_cell)
# Belgium

# Boolean indexing
# Values of Series s that are NOT greater than 1
# (for this data the same as s[s <= 1]; note this is "<= 1", not "< 1")
print(s[~(s > 1)])
# b   -5
# dtype: int64

# Values of Series s that are less than -1 or greater than 2
print(s[(s < -1) | (s > 2)])
# a    3
# b   -5
# c    7
# d    4
# dtype: int64

# Filter the DataFrame rows with a boolean mask; the result must be printed
# to be visible when this runs as a script.
print(df[df['Population'] > 1200000000])
#    Country    Capital  Population
# 1   India  New Delhi  1303171035

# Setting values: overwrite the element of Series s labelled 'a' with 6
s.loc['a'] = 6
print(s)
# a    6
# b   -5
# c    7
# d    4
# dtype: int64

删除数据

# Drop Series entries by index label (row axis, axis=0)
remaining = s.drop(index=['a', 'c'])
print(remaining)
# b   -5
# d    4
# dtype: int64

# Drop a DataFrame column by name (column axis, axis=1)
without_country = df.drop(columns="Country")
print(without_country)
#      Capital  Population
# 0   Brussels    11190846
# 1  New Delhi  1303171035
# 2   Brasília   207847528

调用帮助

# help() writes the documentation itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
help(pd.Series.loc)

排序和排名

# Sort rows by index label
print(df.sort_index())
#     Country    Capital  Population
# 0  Belgium   Brussels    11190846
# 1    India  New Delhi  1303171035
# 2   Brazil   Brasília   207847528

# Sort rows by the values of one column
print(df.sort_values(by="Country"))
#    Country    Capital  Population
# 0  Belgium   Brussels    11190846
# 2   Brazil   Brasília   207847528
# 1    India  New Delhi  1303171035

# Rank the entries within each column (1.0 = smallest)
print(df.rank())
#     Country  Capital  Population
# 0      1.0      2.0         1.0
# 1      3.0      3.0         3.0
# 2      2.0      1.0         2.0

查询序列与数据框的信息

(a) 基本信息

# (rows, columns)
print(df.shape)
# (3, 3)

# Row index
print(df.index)
# RangeIndex(start=0, stop=3, step=1)

# Column names
print(df.columns)
# Index(['Country', 'Capital', 'Population'], dtype='object')

# Summary of the DataFrame. info() prints the report itself and returns
# None, so it is called directly rather than wrapped in print().
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 3 entries, 0 to 2
# Data columns (total 3 columns):
#  #   Column      Non-Null Count  Dtype
# ---  ------      --------------  -----
# 0   Country     3 non-null      object
# 1   Capital     3 non-null      object
# 2   Population  3 non-null      int64
# dtypes: int64(1), object(2)
# memory usage: 200.0+ bytes

# Number of non-NA values per column
print(df.count())
# Country       3
# Capital       3
# Population    3
# dtype: int64

(b) 汇总

# Column totals (string columns are concatenated, as the output shows)
column_totals = df.sum()
print(column_totals)
# Country              BelgiumIndiaBrazil
# Capital       BrusselsNew DelhiBrasília
# Population                   1522209409
# dtype: object

# Running totals down each column
running_totals = df.cumsum()
print(running_totals)
#               Country                    Capital  Population
# 0             Belgium                   Brussels    11190846
# 1        BelgiumIndia          BrusselsNew Delhi  1314361881
# 2  BelgiumIndiaBrazil  BrusselsNew DelhiBrasília  1522209409

# Minimum of the Series divided by its maximum
min_over_max = s.min() / s.max()
print(min_over_max)
# -0.7142857142857143

# A small all-numeric frame used for the summary-statistics examples
data = {
    "gini": [0.4, 0.5, 0.6],
    "Population": [11190846, 1303171035, 207847528],
}
df2 = pd.DataFrame(data)
column_min_over_max = df2.min() / df2.max()
print(column_min_over_max)
# gini          0.666667
# Population    0.008587
# dtype: float64

# Index label of each column's minimum divided by that of its maximum
df2.idxmin() / df2.idxmax()
# gini          0.0
# Population    0.0
# dtype: float64

# Descriptive statistics
df2.describe()
#        gini    Population
# count  3.00  3.000000e+00
# mean   0.50  5.074031e+08
# std    0.10  6.961346e+08
# min    0.40  1.119085e+07
# 25%    0.45  1.095192e+08
# 50%    0.50  2.078475e+08
# 75%    0.55  7.555093e+08
# max    0.60  1.303171e+09

# Mean of each column
column_means = df2.mean()
print(column_means)
# gini          5.000000e-01
# Population    5.074031e+08
# dtype: float64

# Median of each column
column_medians = df2.median()
print(column_medians)
# gini                  0.5
# Population    207847528.0
# dtype: float64

应用函数

# An anonymous (lambda) function that doubles its argument
f = lambda x: x * 2

# Apply the function to the DataFrame
doubled = df2.apply(f)
print(doubled)
#    gini  Population
# 0   0.8    22381692
# 1   1.0  2606342070
# 2   1.2   415695056

# Apply the function to every individual cell
# NOTE(review): recent pandas releases deprecate applymap in favour of
# DataFrame.map — confirm against the installed pandas version.
df2.applymap(f)
#    gini  Population
# 0   0.8    22381692
# 1   1.0  2606342070
# 2   1.2   415695056

数据对齐
（1）内部数据对齐
如有不一致的索引，则使用NA值：

print(s)
# a    6
# b   -5
# c    7
# d    4
# dtype: int64

# s3 has no label "b", so the sum is NaN at that label
s3 = pd.Series([7, -2, 3], index=["a", "c", "d"])
aligned_sum = s + s3
print(aligned_sum)
# a    13.0
# b     NaN
# c     5.0
# d     7.0
# dtype: float64

（2）使用Fill方法运算
还可以使用带 fill_value 参数的算术方法（add、sub、div 等）进行对齐运算：某一方缺失的索引先用 fill_value 填充，再参与计算

# A label missing from one operand is filled with fill_value before the
# operation, so label "b" no longer produces NaN.
added = s.add(s3, fill_value=0)
print(added)
# a    13.0
# b    -5.0
# c     5.0
# d     7.0
# dtype: float64

subtracted = s.sub(s3, fill_value=2)
print(subtracted)
# a   -1.0
# b   -7.0
# c    9.0
# d    1.0
# dtype: float64

divided = s.div(s3, fill_value=4)
print(divided)
# a    0.857143
# b   -1.250000
# c   -3.500000
# d    1.333333
# dtype: float64

输入和输出
（1）读取写入CSV
默认将第一行当作列名

csv_path = r"D:\code\binary_classify\data\train_sub_labels.csv"

# Passing header=None stops read_csv from using the first row as column names
raw_preview = pd.read_csv(csv_path, header=None, nrows=5)
print(raw_preview)
#                                           0      1
# 0                                        id  label
# 1  f38a6374c348f90b587e046aac6079959adf3835      0
# 2  c18f2d887b7ae4f6742ee445113fa1aef383ed77      1
# 3  755db6279dae599ebb4d39a9123cce439965282d      0
# 4  bc3f0c64fb968ff4a8bd33af6971ecae77c75e08      0

# Read the CSV with the default header handling (first row = column names)
df = pd.read_csv(csv_path, nrows=4)
print(df)
#                                         id  label
# 0  f38a6374c348f90b587e046aac6079959adf3835      0
# 1  c18f2d887b7ae4f6742ee445113fa1aef383ed77      1
# 2  755db6279dae599ebb4d39a9123cce439965282d      0
# 3  bc3f0c64fb968ff4a8bd33af6971ecae77c75e08      0

# Write the DataFrame out as CSV
df.to_csv("myDataFrame.csv")

（2）读取写入Excel

# Read a single-sheet Excel workbook
df = pd.read_excel(r"D:\code\python_for_microscopists\other_files\K_Means.xlsx")
print(df)
#        X    Y
# 0      1   42
# 1      2   46
# ..   ...  ...
# 148  149   61
# [149 rows x 2 columns]

# Write the DataFrame to an Excel sheet
df.to_excel("myDataFrame.xlsx", sheet_name="Sheet1")

# Read from a workbook that holds several sheets: open it once, then read
# sheets by name. The with-block closes the workbook's file handle afterwards
# instead of leaving it open.
with pd.ExcelFile("file.xls") as xlsx:
    df = pd.read_excel(xlsx, sheet_name="Sheet1")

求则得之，舍则失之

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Pandas基础知识(1)

Pandas是基于Numpy创建的Python库，为Python提供易于使用的数据结构和数据分析工具。使用以下数据导入Pandas库：import pandas as pd数据结构Series序列存储任意类型数据的一维数组s = pd.Series([2, -5, 8, 3], index=['a','b','c','d'])print(s)# a 2# b -5# c 8# d 3# dtype: int64# 取序列的值print(s["b"])
复制链接

扫一扫

专栏目录