详解 Pandas 的两大核心数据类型

文刀小桂

于 2024-08-28 19:44:47 发布

阅读量61

点赞数 1

分类专栏： Pandas 文章标签： pandas python 开发语言

本文链接：https://blog.csdn.net/weixin_44480009/article/details/141572921

版权

Pandas 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

一、Series

Series 是一维的带标签（索引）数组，由一组数据值（values）和与之一一对应的索引（index）组成。

1. 创建

import pandas as pd
import numpy as np

# Demonstrates two ways to construct a pandas Series and prints each result.
separator = "*" * 20

# Approach 1: build from an array or a list.
# No index is given for s1, so the default integer labels are used.
values = np.array([1, 2, 3, 4])
s1 = pd.Series(values)
print(type(s1))
print(s1)
print(separator)

# An explicit index supplies one label per value.
labels = ["a", "b", "c", "d", "e"]
s2 = pd.Series([4, 5, 6, 7, 8], index=labels)
print(s2)

# Approach 2: build from a dict; the keys become the index labels.
person = {"name": "张三", "age": 26, "job": "程序员"}
s3 = pd.Series(person)
print(separator)
print(s3)

2. 属性

import pandas as pd

# Walks through the basic attributes of a Series built from a dict.
record = {"name": "张三", "age": 26, "job": "程序员"}
s3 = pd.Series(record)

# (1) values: the underlying data values.
data = s3.values
print(data)
print(type(data))

# (2) index: the labels attached to the values.
labels = s3.index
print(labels)
print(type(labels))
print(len(labels))
print(list(labels))

# (3) dtype: element type of the values (str and int are mixed here).
print(s3.dtype)

# (4) name: the name of the Series; none was assigned, so this prints None.
print(s3.name)

# (5) shape: a one-element tuple for a Series, because it has a single axis.
print(s3.shape)

3. 索引和切片

import pandas as pd

# Indexing and slicing a Series.
# `t` is labelled by the dict keys; `t1` is built from a plain list with no explicit index.
t = pd.Series({
    "id": 1001,
    "username": "李四",
    "slogan": "keep going"
})

t1 = pd.Series([4, 5, 6, 7, 9, 10])

# Indexing with []
# Look up a single value by its label.
print(t["id"])

# Look up by position.
# t[1] (an integer inside [] on a label-indexed Series) produces a warning,
# so positional access goes through .iloc instead.
print(t.iloc[2])

# A list of labels selects several entries at once.
print(t[["id", "slogan"]])

# Boolean indexing: keep only the entries where the mask is True.
print(t1[t1 < 8])

# Slicing: the first three entries.
print(t1[:3])

4. Series 常用方法

Series 的数据值底层是 NumPy 的 ndarray，因此 ndarray 的大多数相关方法（如 sum、mean、max 等）同样可以用于 Series。

import pandas as pd

# Shows two common Series methods: where() and rank().
t1 = pd.Series([4, 5, 6, 7, 9, 10])

# 1. where(): keep the values that satisfy the condition; the others become NaN.
above_six = t1 > 6
filtered = t1.where(above_six)
print(filtered)

# 2. rank(method="dense|first|min|max", ascending=True|False):
#    return the rank of every value. `method` decides how ties are ranked:
#      dense   - tied values share a rank and no rank is skipped, e.g. 1, 2, 3, 3, 4
#      first   - tied values are ranked in order of appearance,   e.g. 1, 2, 3, 4
#      min     - tied values all take the lowest rank of the tie, e.g. 1, 2, 3, 3
#      max     - tied values all take the highest rank of the tie, e.g. 1, 2, 4, 4
#      average - the default: tied values take the mean rank,     e.g. 1, 2, 3.5, 3.5
dense_descending = t1.rank(method="dense", ascending=False)
print(dense_descending)

二、DataFrame

DataFrame 是二维的带标签数组，可以看作 Series 的容器（每一列都是一个 Series）；它既有行索引（index，axis=0），也有列索引（columns，axis=1）。

1. 创建

import pandas as pd
import numpy as np

# Demonstrates two ways to construct a DataFrame and prints each result.

# Approach 1: wrap a 2-D ndarray (3 rows x 4 columns of the integers 0..11).
grid = np.arange(12).reshape(3, 4)
df = pd.DataFrame(grid)
print(type(df))
print(df)

print("*" * 20)

# Approach 2: pass a dict that maps each column name to its list of values.
columns = {
    "name": ["张三", "李四"],
    "age": [22, 25],
    "tel": ["10086", "10010"],
}
df2 = pd.DataFrame(columns)

print(type(df2))
# Each printed row is one record.
print(df2)

2. 属性

import pandas as pd

# Walks through the basic attributes of a DataFrame.
people = {
    "name": ["张三", "李四"],
    "age": [22, 25],
    "tel": ["10086", "10010"],
}
df2 = pd.DataFrame(people)

# (1) index: the row labels.
row_labels = df2.index
print(row_labels)  # RangeIndex(start=0, stop=2, step=1)

# The row labels can be replaced by assigning a new list to .index.
df2.index = ["a", "b"]

# (2) columns: the column labels.
column_labels = df2.columns
print(column_labels)  # Index(['name', 'age', 'tel'], dtype='object')

# (3) values: the element values as an ndarray.
print(df2.values)

# (4) dtypes: the element type of every column.
print(df2.dtypes)

# (5) shape: the shape of the frame.
print(df2.shape)

# (6) ndim: the number of dimensions.
print(df2.ndim)

3. 常用方法

import numpy as np
import pandas as pd

# Tour of frequently used DataFrame methods on a 20x5 frame holding 0..99.
t = pd.DataFrame(np.arange(100).reshape((20, 5)), columns=list("abcde"))

# (1) head(): show the first rows (5 by default).
print(t.head())
print(t.head(2))

print("*" * 30)

# (2) tail(): show the last rows (5 by default).
print(t.tail())
print(t.tail(3))

print("*" * 30)

# (3) info(): print a concise overview of the frame.
# info() writes the summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
t.info()

print("*" * 30)

# (4) describe(): descriptive statistics for the frame.
print(t.describe())

print("*" * 30)

# (5) sort_values() / sort_index(): sort by values / by index, ascending by default.
print(t.sort_values(by="a", ascending=False))

# (6) unique(): the distinct values of a column.
print(t["a"].unique())

# (7) drop_duplicates(subset=[...]): remove duplicated rows.
# Without `subset`, a row is dropped only when every column matches another row;
# pass `subset` to compare on specific columns only.
print(t.drop_duplicates())

# (8) rename(columns={old_name: new_name}): rename columns.
# Pass inplace=True to modify the frame itself instead of returning a copy.
print(t.rename(columns={"a": "A"}))

# (9) drop(columns=[...] | index=[...]): remove columns or rows by label.
print(t.drop(columns=["b"]))

4. 索引切片

import numpy as np
import pandas as pd

# Indexing and slicing a DataFrame: a 3x4 frame with row labels a..c and
# column labels W..Z holding the integers 0..11.
t1 = pd.DataFrame(np.arange(12).reshape(3,4), index=list("abc"), columns=list("WXYZ"))
print(t1)
print("*" * 30)

# Approach 1: plain [] selects rows or columns.
print(t1[:1]) # an integer slice selects rows
print(t1["X"]) # a string selects a column and returns a Series
print(t1[["X"]]) # a list of strings selects columns and returns a DataFrame
print(t1[:2]["Y"]) # chain the two to pick given rows and a column

print("*" * 30)

# Approach 2: loc[index, columns] selects by row label and column label.
print(t1.loc["b"])
print(t1.loc["a", "Y"])
print(t1.loc[:, "W"])
print(t1.loc["b", :])
print(t1.loc[["a", "c"], ["W", "Z"]])
print(t1.loc["a":, "X"])
print(t1.loc["a":"c", "Z"]) # label slices include both endpoints

print("*" * 30)

# Approach 3: iloc[index, columns] selects by row number and column number.
print(t1.iloc[0])
print(t1.iloc[:, 1])
print(t1.iloc[2, 3])
print(t1.iloc[[0, 2], [2, 1]])
print(t1.iloc[:2, 1:3])
print(t1.iloc[0:2, 3]) # positional slices include the start but exclude the end

print("*" * 30)

# Approach 4: boolean indexing keeps the rows where the condition is True.
print(t1[t1["X"] > 5])
print(t1[(t1["X"] > 3) & (t1["Z"] < 8)])


# reindex([...]): fetch rows by label; labels that do not exist come back as NaN.
print(t1.reindex(["a", "f"]))

# set_index(["column"], drop=True|False): use a column as the index.
print(t1.set_index("W"))

# Index labels are allowed to repeat; unique() returns the distinct labels.
print(t1.index.unique())

5. 复合索引

MultiIndex

import numpy as np
import pandas as pd

# Demonstrates building and querying a two-level (compound) row index.
a = pd.DataFrame({
    "a": np.arange(7),
    "b": np.arange(7, 0, -1),
    "c": ["one"] * 3 + ["two"] * 4,
    "d": list("hijklmn"),
})
short_rule = "*" * 20
long_rule = "*" * 30

print(a)
print(short_rule)

# Build the compound index from columns "c" (first level) and "d" (second level).
# Inspect it with type(b.index) / b.index if needed.
b = a.set_index(["c", "d"])
print(b)

print(long_rule)

# Reading through the compound index one level at a time:
# column -> first-level label -> second-level label.
column_a = b["a"]
print(column_a["one"]["h"])
print(b["b"]["two"])
rows_one = b.loc["one"]
print(rows_one.loc["h"])

print(long_rule)

# Same columns, but with the levels in the opposite order ("d" first, then "c").
c = a.set_index(["d", "c"])
print(c)
print(long_rule)

# swaplevel() exchanges the index levels, so "c" can be used as the first key again.
swapped = c.swaplevel()
print(swapped)
print(swapped["a"]["one"])

文刀小桂

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
详解 Pandas 的两大核心数据类型

DataFrame 是二维数组，是 Series 的容器，有行索引（index, axis=0）和列索引（columns, axis=1）。ndarray 的相关方法可以用于 Series。Series 是一维的带标签（索引）数组。
复制链接

扫一扫

专栏目录