Python基础48：pandas模块的介绍

最新推荐文章于 2022-12-07 14:33:08 发布

xiaowei0548

最新推荐文章于 2022-12-07 14:33:08 发布

阅读量227

点赞数

分类专栏： Python基础文章标签： pandas

本文链接：https://blog.csdn.net/xiaowei0548/article/details/117677650

版权

Python基础专栏收录该内容

55 篇文章 2 订阅

订阅专栏

# **************************************************************************
# Python学习
# **************************************************************************
# ** 所属主题: 模块
# ** 所属分层: 48 pandas模块的介绍
# ** 功能描述: 48 pandas模块的介绍
# ** 创 建 者: 陈红伟
# ** 创建日期: 2021/4/22 11:55 下午
# **************************************************************************
# ** 修改日期     修改人    修改内容
# ** 2021/4/22   陈红伟    新增学习内容代码
# **************************************************************************
import pandas as pd
import numpy as np
import os
excel_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))+'/test/vipthink/fa/订单课耗2018数据2021-04-23.xlsx'

"""
pandas中Series序列：
"""
# =======================Series=======================
# 一、生成Series序列：
#   1】、创建Series序列方法1：
# s = pd.Series(1,2,3,np.nan,'2021-04-24',True) # TypeError: object of type 'int' has no len()
s = pd.Series([1,2,3,np.nan,'2021-04-24',True]) # 将Series的参数改为列表类型就OK了
print(s)
# 0             1
# 1             2
# 2             3
# 3           NaN
# 4    2021-04-24
# 5          True
# dtype: object


#   2】、创建Series序列方法2：
indexs = pd.date_range('2021-04-01',periods=6)
print(indexs)
# DatetimeIndex(['2021-04-01', '2021-04-02', '2021-04-03', '2021-04-04',
#                '2021-04-05', '2021-04-06'],
#               dtype='datetime64[ns]', freq='D')



"""
pandas中DataFrame：
"""
# =======================DataFrame=======================
# 一、创建DataFrame
#   1】、创建DataFrame方法1：
df1 = pd.DataFrame(data=np.arange(24).reshape(6,4),index=indexs,columns=['A','B','C','D'])
print(df1)
#              A   B   C   D
# 2021-04-01   0   1   2   3
# 2021-04-02   4   5   6   7
# 2021-04-03   8   9  10  11
# 2021-04-04  12  13  14  15
# 2021-04-05  16  17  18  19
# 2021-04-06  20  21  22  23


#   2】、创建DataFrame方法2：
df2 = pd.DataFrame(np.arange(24).reshape(6,4))
print(df2)
#     0   1   2   3
# 0   0   1   2   3
# 1   4   5   6   7
# 2   8   9  10  11
# 3  12  13  14  15
# 4  16  17  18  19
# 5  20  21  22  23


#   3】、创建DataFrame方法3：
df3 = pd.read_excel(excel_path)
print(df3)
#     tttt    id                 name  ...  累计值                etl时间  是否是测试数据
# 0    111  2663  chw6523.41027278453  ...    2  2021-04-23 18:48:30        0
# 1    111  5174  chw8601.19119286537  ...    2  2021-04-23 18:48:30        0
# 2    111  7482   chw2497.0723176375  ...    2  2021-04-23 18:48:30        0
# 3    111  6987  lkp3488.16597834229  ...    2  2021-04-23 18:48:30        0
# 4    111  2941  lkp8045.83088494837  ...    2  2021-04-23 18:48:30        0
# [4 rows x 11 columns]



#   4】、创建DataFrame方法4：
df4 = pd.DataFrame({
    'A':1,
    'B':pd.Timestamp(20210424),
    'C':np.array([3,4,5,6],dtype=int),
    'D':["z","y","x","p"],
    'E':1,
})
print(df4)
#    A                             B  C  D  E
# 0  1 1970-01-01 00:00:00.020210424  3  z  1
# 1  1 1970-01-01 00:00:00.020210424  4  y  1
# 2  1 1970-01-01 00:00:00.020210424  5  x  1
# 3  1 1970-01-01 00:00:00.020210424  6  p  1


# 二、DataFrame的属性

# 打印原来的DateFrame
print(df1)
#              A   B   C   D
# 2021-04-01   0   1   2   3
# 2021-04-02   4   5   6   7
# 2021-04-03   8   9  10  11
# 2021-04-04  12  13  14  15
# 2021-04-05  16  17  18  19
# 2021-04-06  20  21  22  23

# 将B列的值*10
df1.B = df1.B.apply(lambda value:value*10)
print(df1)
#              A    B   C   D
# 2021-04-01   0   10   2   3
# 2021-04-02   4   50   6   7
# 2021-04-03   8   90  10  11
# 2021-04-04  12  130  14  15
# 2021-04-05  16  170  18  19
# 2021-04-06  20  210  22  23


# 2.1 查看索引index
print(df1.index)
# DatetimeIndex(['2021-04-01', '2021-04-02', '2021-04-03', '2021-04-04',
#                '2021-04-05', '2021-04-06'],
#               dtype='datetime64[ns]', freq='D')

# 2.2 查看列名columns
print(df1.columns)
# Index(['A', 'B', 'C', 'D'], dtype='object')

# 2.3 查看具体的值
print(df1.values)
# [[  0  10   2   3]
#  [  4  50   6   7]
#  [  8  90  10  11]
#  [ 12 130  14  15]
#  [ 16 170  18  19]
#  [ 20 210  22  23]]

# 2.4 DataFrame的描述，只能针对类型是数字类型的列
print(df1.describe())
#                A           B          C          D
# count   6.000000    6.000000   6.000000   6.000000
# mean   10.000000  110.000000  12.000000  13.000000
# std     7.483315   74.833148   7.483315   7.483315
# min     0.000000   10.000000   2.000000   3.000000
# 25%     5.000000   60.000000   7.000000   8.000000
# 50%    10.000000  110.000000  12.000000  13.000000
# 75%    15.000000  160.000000  17.000000  18.000000
# max    20.000000  210.000000  22.000000  23.000000

# 2.5 排序
# 2.5.1 遍历行，按照列名 倒序(ascending=False)排序
print(df1.sort_index(axis=1,ascending=False))
#              D   C    B   A
# 2021-04-01   3   2   10   0
# 2021-04-02   7   6   50   4
# 2021-04-03  11  10   90   8
# 2021-04-04  15  14  130  12
# 2021-04-05  19  18  170  16
# 2021-04-06  23  22  210  20
# 列明倒序了

# 2.5.2 遍历列，按照索引名 倒序(ascending=False)排序
print(df1.sort_index(axis=0,ascending=False))
#              A    B   C   D
# 2021-04-06  20  210  22  23
# 2021-04-05  16  170  18  19
# 2021-04-04  12  130  14  15
# 2021-04-03   8   90  10  11
# 2021-04-02   4   50   6   7
# 2021-04-01   0   10   2   3
# 索引倒序了

# 2.5.3 遍历行，按照列值 倒序(ascending=False)排序
print(df1.sort_values(by='2021-04-04',axis=1,ascending=False))
#               B   D   C   A
# 2021-04-01   10   3   2   0
# 2021-04-02   50   7   6   4
# 2021-04-03   90  11  10   8
# 2021-04-04  130  15  14  12
# 2021-04-05  170  19  18  16
# 2021-04-06  210  23  22  20


# 三、DataFrame 选择数据
print(df1)
#              A    B   C   D
# 2021-04-01   0   10   2   3
# 2021-04-02   4   50   6   7
# 2021-04-03   8   90  10  11
# 2021-04-04  12  130  14  15
# 2021-04-05  16  170  18  19
# 2021-04-06  20  210  22  23

# 打印A列： df1['A']和df1.A用法一样的
print(df1['A'])
print(df1.A)
# 2021-04-01     0
# 2021-04-02     4
# 2021-04-03     8
# 2021-04-04    12
# 2021-04-05    16
# 2021-04-06    20
# Freq: D, Name: A, dtype: int64

# 打印下标为0、1、2行；# 并且打印D列
print(df1[0:3],df1.D)
#             A   B   C   D
# 2021-04-01  0  10   2   3
# 2021-04-02  4  50   6   7
# 2021-04-03  8  90  10  11

# 2021-04-01     3
# 2021-04-02     7
# 2021-04-03    11
# 2021-04-04    15
# 2021-04-05    19
# 2021-04-06    23
# Freq: D, Name: D, dtype: int64

print(df1['C']>20)
# 2021-04-01    False
# 2021-04-02    False
# 2021-04-03    False
# 2021-04-04    False
# 2021-04-05    False
# 2021-04-06     True
# Freq: D, Name: C, dtype: bool

print(df1[(df1.C>20)])
#              A    B   C   D
# 2021-04-06  20  210  22  23

print(df1[(df1.C>2)].C.isin([22]))
# 2021-04-02    False
# 2021-04-03    False
# 2021-04-04    False
# 2021-04-05    False
# 2021-04-06     True
# Freq: D, Name: C, dtype: bool

isin_df = df1[ df1.C.isin([22]) ]
print(isin_df)
#              A    B   C   D
# 2021-04-06  20  210  22  23

pandas数据处理

###############################################################
#pandas数据处理
###############################################################
l = ['转介绍渠道', '被介绍人订单号', '被介绍人学员ID', '被介绍人-赠送课时订单号', '被介绍人-赠送课时包ID（赠送课程）',
     '介绍人学员ID', '介绍人-赠送课时订单号', '介绍人-赠送课时包ID（赠送课程）', '转介绍赠送课程-套餐名', '赠送时间',
     '赠送课时数', '课程类型', '消耗年份', '消耗月份', '消耗课时', '剩余课时', '2020-02消耗',
     '2020-03消耗', '2020-04消耗', '2020-05消耗', '2020-08消耗', '2021-04消耗',
     '2020消耗', '2021消耗']

i = l.__contains__('2020-03消耗')
# print(i)


import os
import numpy as np
import pandas as pd
import datetime

out_path_excel = os.path.dirname(os.path.dirname(__file__)) + f'/data/test{datetime.date.today()}.xlsx'

arr = np.arange(50)
# print(arr)

df = pd.DataFrame(arr.reshape(10, 5), columns=['a', 'b', 'c', 'd', 'e'])


def style_color(df, colors):
    """

    :param df: pd.DataFrame
    :param colors: 字典 内容是 {标题:颜色}
    :return:
    """
    return df.style.apply(style_apply, colors=colors)


def style_apply(series, colors, back_ground=''):
    """
    :param series: 传过来的数据是DataFramt中的一列  类型为pd.Series
    :param colors: 内容是字典 其中key 为标题名  value 为颜色
    :param back_ground: 北京颜色
    :return:
    """
    series_name = series.name[0]
    a = list()
    # 为了给每一个单元格上色
    for col in series:
        # 其中 col 为pd.DataFrame 中的 一个小单元格  大家可以根据不同需求为单元格设置不同的颜色
        # 获取什么一级标题获取什么颜色
        if series_name in colors:
            for title_name in colors:
                if title_name == series_name:
                    back_ground = 'background-color: ' + colors[title_name]
                    # '; border-left-color: #080808'
        a.append(back_ground)
    return a


# style_df = style_color(df, {"a": '#1C1C1C', "b": '#00EEEE', "c": '#1A1A1A'})

# df = df.style.apply(lambda col: {"a": 'background-color:#1C1C1C', "b": 'background-color:#1C1C1C', "c": 'background-color:#1C1C1C'},axis=0)
# df = df.style.apply(style_df,colors = {"a": '#1C1C1C', "b": '#00EEEE', "c": '#1A1A1A'})

# print(style_df.to_excel(out_path_excel,sheet_name='sheet1',index=False))


def test():
    df = pd.DataFrame({
        'A': [1, 2, 3, 4, 5, 6, 7],
        'B': [0, 0, 3, 4, 5, 6, 7],
        'C1': [0, 0, 0, 4, 5, 6, 7],
        'C2': [11, 21, 31, 41, 15, 16, 17],
        'C3': [21, 2, 3, 4, 5, 6, 7],
        'D': [31, 2, 3, 4, 5, 6, 7],
    })
    melt_df = pd.melt(df, id_vars=['A', 'B', 'D'], value_vars=['C1', 'C2', 'C3'], var_name=['C的类型'], value_name='C的值')
    print(melt_df)

    # 只要有一列值为0就取出来
    res = melt_df.loc[(melt_df == 0).any(axis=1), :]

    # 分组求和
    res = res.groupby(['A', 'C的类型']).agg('sum')

    print(type(res.index), res.index)

    res_index = res.index

    # res['A'] = res_index.get_level_values('A')
    # res['C的类型'] = res_index.get_level_values('C的类型')

    # print(res)

    df2 = pd.DataFrame(np.zeros(40).reshape(10, 4), columns=['A', 'C', 'E', 'F'], dtype=object)

    df2.index.rename('a', True)

    # print(df2)

    res = pd.merge(res, df2, how='left', left_on=['A', 'C的类型'], right_on=['A', 'C'])

    # 填充所有列中空值为0
    res.fillna(0, inplace=True)

    # 过滤部分列值为0的数据 并且A的值大于1
    res = res[(res['B'] == 0) & (res["A"] > 1)]

    res = pd.merge(res, df2, how='left', left_on='A', right_on='A')

    print(res)


test()

xiaowei0548

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
Python基础48：pandas模块的介绍

# **************************************************************************# Python学习# **************************************************************************# ** 所属主题: 模块# ** 所属分层: 48 pandas模块的介绍# ** 功能描述: 48 pandas模块的介绍# ** 创建者: 陈红伟# ** 创建日期
复制链接

扫一扫