pandas三

pandas更新中…

pandas版本:2.0.3

pandas中的一些操作

导包

import pandas as pd
import numpy as np
from pandas import DataFrame, Series

merge操作

# Build two small frames that share only the "key" column.
df1 = pd.DataFrame({"key": ["X", "B", "Z"], "data_set_1": list(range(3))})
print(df1)
#   key  data_set_1
# 0   X           0
# 1   B           1
# 2   Z           2
df2 = pd.DataFrame({"key": ["B", "A", "C"], "data_set_2": [4, 5, 6]})
print(df2)
#   key  data_set_2
# 0   B           4
# 1   A           5
# 2   C           6

# Inner join (the default): keep only rows whose "key" appears in BOTH frames.
inner_joined = pd.merge(df1, df2)
print(inner_joined)
#   key  data_set_1  data_set_2
# 0   B           1           4

# Left join: every row of df1 survives; unmatched df2 values become NaN.
left_joined = pd.merge(df1, df2, on="key", how="left")
print(left_joined)
#   key  data_set_1  data_set_2
# 0   X           0         NaN
# 1   B           1         4.0
# 2   Z           2         NaN

# Outer join: union of keys from both frames, NaN where either side is missing.
outer_joined = pd.merge(df1, df2, on="key", how="outer")
print(outer_joined)
# key  data_set_1  data_set_2
# 0   X         0.0         NaN
# 1   B         1.0         4.0
# 2   Z         2.0         NaN
# 3   A         NaN         5.0
# 4   C         NaN         6.0

concat操作

Series的concat

# Two Series whose index labels partially overlap ("b" and "c").
s1 = Series([1, 2, 3], index=list("abc"))
print(s1)
# a    1
# b    2
# c    3
# dtype: int64
s2 = Series([4, 5, 6], index=list("bcd"))
print(s2)
# b    4
# c    5
# d    6
# dtype: int64

# Row-wise concat (axis=0) simply stacks values; duplicate labels are kept.
stacked = pd.concat([s1, s2])
print(stacked)
# a    1
# b    2
# c    3
# b    4
# c    5
# d    6
# dtype: int64
# Column-wise concat (axis=1) aligns on the index; gaps are filled with NaN.
side_by_side = pd.concat([s1, s2], axis=1)
print(side_by_side)
#      0    1
# a  1.0  NaN
# b  2.0  4.0
# c  3.0  5.0
# d  NaN  6.0

DataFrame的concat

# Stacking the two merge-demo frames keeps their original row labels;
# columns absent on one side are filled with NaN.
stacked_frames = pd.concat([df1, df2])
print(stacked_frames)
#   key  data_set_1  data_set_2
# 0   X         0.0         NaN
# 1   B         1.0         NaN
# 2   Z         2.0         NaN
# 0   B         NaN         4.0
# 1   A         NaN         5.0
# 2   C         NaN         6.0

# Side-by-side concat aligns on the row index; the "key" column appears twice.
wide_frames = pd.concat([df1, df2], axis=1)
print(wide_frames)
#   key  data_set_1 key  data_set_2
# 0   X           0   B           4
# 1   B           1   A           5
# 2   Z           2   C           6

combine_first操作

# A frame with NaN holes on rows 1 and 3 of every column.
nan = np.nan
df1 = pd.DataFrame(
    {
        "X": [1, nan, 3, nan],
        "Y": [5, nan, 7, nan],
        "Z": [9, nan, 11, nan],
    }
)
print(df1)
#      X    Y     Z
# 0  1.0  5.0   9.0
# 1  NaN  NaN   NaN
# 2  3.0  7.0  11.0
# 3  NaN  NaN   NaN
# A second frame that overlaps df1 only on column "Z".
df2 = pd.DataFrame(
    {
        "Z": [999, 10, nan, 12],
        "A": [1, 2, 3, 4],
    }
)
print(df2)
#        Z  A
# 0  999.0  1
# 1   10.0  2
# 2    NaN  3
# 3   12.0  4

# df1 takes priority: only df1's NaN cells are patched from df2.
print(df1.combine_first(df2))
#    A    X    Y     Z
# 0  1  1.0  5.0   9.0
# 1  2  NaN  NaN  10.0
# 2  3  3.0  7.0  11.0
# 3  4  NaN  NaN  12.0
# df2 takes priority: only df2's NaN cells are patched from df1.
print(df2.combine_first(df1))
#    A    X    Y      Z
# 0  1  1.0  5.0  999.0
# 1  2  NaN  NaN   10.0
# 2  3  3.0  7.0   11.0
# 3  4  NaN  NaN   12.0

apply操作

# 100 raw tick strings: 50 with Seqno 0 followed by 50 with Seqno 1.
data = ["Symbol: APPL Seqno: 0 Price: 100"] * 50 + [
    "Symbol: APPL Seqno: 1 Price: 100"
] * 50
df_ = DataFrame({"time": np.arange(100), "data": data})
print(df_.head())
#    time                              data
# 0     0  Symbol: APPL Seqno: 0 Price: 100
# 1     1  Symbol: APPL Seqno: 0 Price: 100
# 2     2  Symbol: APPL Seqno: 0 Price: 100
# 3     3  Symbol: APPL Seqno: 0 Price: 100
# 4     4  Symbol: APPL Seqno: 0 Price: 100


def foo(line):
    """Split one 'Symbol: X Seqno: N Price: P' string into its three values."""
    tokens = line.strip().split(" ")
    # Odd positions hold the values; even positions hold the field labels.
    return Series([tokens[1], tokens[3], tokens[5]])


# apply(foo) returns one Series per row, i.e. a 3-column DataFrame.
df_temp = df_["data"].apply(foo)
df_temp = df_temp.rename(columns={0: "Symbol", 1: "Seqno", 2: "Price"})
print(df_temp.head())
#   Symbol Seqno Price
# 0   APPL     0   100
# 1   APPL     0   100
# 2   APPL     0   100
# 3   APPL     0   100
# 4   APPL     0   100

# combine_first merges the parsed columns back next to the originals.
df_new = df_.combine_first(df_temp)
print(df_new.head())
#   Price Seqno Symbol                              data  time
# 0   100     0   APPL  Symbol: APPL Seqno: 0 Price: 100     0
# 1   100     0   APPL  Symbol: APPL Seqno: 0 Price: 100     1
# 2   100     0   APPL  Symbol: APPL Seqno: 0 Price: 100     2
# 3   100     0   APPL  Symbol: APPL Seqno: 0 Price: 100     3
# 4   100     0   APPL  Symbol: APPL Seqno: 0 Price: 100     4
# The raw string column is no longer needed.
del df_new["data"]
print(df_new.head())
# Price Seqno Symbol  time
# 0   100     0   APPL     0
# 1   100     0   APPL     1
# 2   100     0   APPL     2
# 3   100     0   APPL     3
# 4   100     0   APPL     4

# Keep only the first row of each distinct Seqno value.
df_new_ = df_new.drop_duplicates(["Seqno"])
print(df_new_)
#    Price Seqno Symbol  time
# 0    100     0   APPL     0
# 50   100     1   APPL    50

分箱技术

将一个list数值按某一个数值段进行切分统计,将[25, 100)中的20个随机整数(randint的上界不包含),按照(0, 59]、(59, 70]、(70, 80]、(80, 100]几个区间进行统计。

# Binning: put 20 random scores from [25, 100) into the grade bands
# (0, 59], (59, 70], (70, 80], (80, 100] and count how many fall in each.
np.random.seed(0)
score_list = np.random.randint(25, 100, 20)
bins = [0, 59, 70, 80, 100]
score_cut = pd.cut(score_list, bins)  # Categorical of Interval bins
# Fix 1: the original `pd.value_counts(score_cut)` discarded its result, so a
# script shows nothing — print it.  Fix 2: `pd.value_counts` is deprecated
# (removed in pandas 3.0); use the Series method instead (same output).
print(pd.Series(score_cut).value_counts())
# (80, 100]    8
# (0, 59]      6
# (59, 70]     4
# (70, 80]     2
# Name: count, dtype: int64

# DataFrame binning: attach the band as a new column of the frame.
df = DataFrame()
np.random.seed(0)
score_list = np.random.randint(25, 100, 20)
df["score"] = score_list
df["student"] = list("abcdefghijklmnopqrst")
# labels= replaces the Interval values with readable grade names.
df["Categories"] = pd.cut(df["score"], bins, labels=["Low", "Ok", "Good", "Great"])
print(df.head())
#    score student Categories
# 0     69       a         Ok
# 1     72       b       Good
# 2     89       c      Great
# 3     92       d      Great
# 4     92       e      Great

groupby 对数据进行分组

groupby = split + apply + combine

# Load the demo weather data; groupby = split + apply + combine.
df = pd.read_csv("test_data.csv")
print(df)  # show the raw data
#           date city  temperature   wind
# 0   03/01/2016   BJ            8      5
# 1   17/01/2016   BJ           12      2
# 2   31/01/2016   BJ           19      2
# 3   14/02/2016   BJ           -3      3
# 4   28/02/2016   BJ           19      2
# 5   13/03/2016   BJ            5      3
# 6   27/03/2016   SH           -4      4
# 7   10/04/2016   SH           19      3
# 8   24/04/2016   SH           20      3
# 9   08/05/2016   SH           17      3
# 10  22/05/2016   SH            4      2
# 11  05/06/2016   SH          -10      4
# 12  19/06/2016   SH            0      5
# 13  03/07/2016   SH           -9      5
# 14  17/07/2016   GZ           10      2
# 15  31/07/2016   GZ           -1      5
# 16  14/08/2016   GZ            1      5
# 17  28/08/2016   GZ           25      4
# 18  11/09/2016   SZ           20      1
# 19  25/09/2016   SZ          -10      4
# Group rows by the "city" column (column name and df["city"] are equivalent).
g = df.groupby("city")
print(g.groups)
# Each key maps to the row labels of its group, e.g. BJ -> [0..5]:
# {'BJ': [0, 1, 2, 3, 4, 5], 'GZ': [14, 15, 16, 17], 'SH': [6, 7, 8, 9, 10, 11, 12, 13], 'SZ': [18, 19]}

# Pull a single group out as its own DataFrame.
df_bj = g.get_group("BJ")
print(df_bj.head())
#  date city  temperature  wind
# 0  03/01/2016   BJ            8     5
# 1  17/01/2016   BJ           12     2
# 2  31/01/2016   BJ           19     2
# 3  14/02/2016   BJ           -3     3
# 4  28/02/2016   BJ           19     2

# Numeric summary of just that one group.
print(df_bj.mean(numeric_only=True))
# temperature    10.000000
# wind            2.833333
# dtype: float64

对分组后所有数据分析

# Aggregate every group at once: one result row per city.
city_means = g.mean(numeric_only=True)
print(city_means)
#       temperature      wind
# city
# BJ         10.000  2.833333
# GZ          8.750  4.000000
# SH          4.625  3.625000
# SZ          5.000  2.500000
city_maxes = g.max(numeric_only=True)
print(city_maxes)
#       temperature  wind
# city
# BJ             19     5
# GZ             25     5
# SH             20     5
# SZ             20     4

分组后聚合操作

# Custom aggregation: peak-to-peak (max minus min) of each column per group.
def foo(attr):
    """Return the spread (max - min) of one numeric column within a group."""
    highest = attr.max(numeric_only=True)
    lowest = attr.min(numeric_only=True)
    return highest - lowest


df_group = g[["temperature", "wind"]].agg(foo)
print(df_group)
#       temperature  wind
# city
# BJ             22     3
# GZ             26     3
# SH             30     3
# SZ             30     3

对某两个column进行分组,获取值的时候需要传入tuple

# Grouping by two columns: each group key becomes a (city, wind) tuple.
g_new = df.groupby(["city", "wind"])
print(g_new.groups)
# e.g. ('BJ', 2) covers rows with index [1, 2, 4]:
# {('BJ', 2): [1, 2, 4], ('BJ', 3): [3, 5],...}

# get_group expects the full key tuple.
print(g_new.get_group(("BJ", 2)))  # DataFrame
#          date city  temperature  wind
# 1  17/01/2016   BJ           12     2
# 2  31/01/2016   BJ           19     2
# 4  28/02/2016   BJ           19     2

# Iterating a GroupBy yields (key, sub-frame) pairs; inspect the first one.
first_key, first_frame = next(iter(g_new))
print(first_key)  # ('BJ', 2)
print(type(first_key))  # <class 'tuple'>
print(first_frame)
#          date city  temperature  wind
# 1  17/01/2016   BJ           12     2
# 2  31/01/2016   BJ           19     2
# 4  28/02/2016   BJ           19     2
print(type(first_frame))  # <class 'pandas.core.frame.DataFrame'>
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值