# **************************************************************************
# Python学习
# **************************************************************************
# ** 所属主题: 模块
# ** 所属分层: 48 pandas模块的介绍
# ** 功能描述: 48 pandas模块的介绍
# ** 创 建 者: 陈红伟
# ** 创建日期: 2021/4/22 11:55 下午
# **************************************************************************
# ** 修改日期 修改人 修改内容
# ** 2021/4/22 陈红伟 新增学习内容代码
# **************************************************************************
import pandas as pd
import numpy as np
import os
excel_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))+'/test/vipthink/fa/订单课耗2018数据2021-04-23.xlsx'
"""
pandas中Series序列:
"""
# =======================Series=======================
# 一、生成Series序列:
# 1】、创建Series序列方法1:
# s = pd.Series(1,2,3,np.nan,'2021-04-24',True) # TypeError: object of type 'int' has no len()
s = pd.Series([1,2,3,np.nan,'2021-04-24',True]) # 将Series的参数改为列表类型就OK了
print(s)
# 0 1
# 1 2
# 2 3
# 3 NaN
# 4 2021-04-24
# 5 True
# dtype: object
# 2】、创建Series序列方法2:
indexs = pd.date_range('2021-04-01',periods=6)
print(indexs)
# DatetimeIndex(['2021-04-01', '2021-04-02', '2021-04-03', '2021-04-04',
# '2021-04-05', '2021-04-06'],
# dtype='datetime64[ns]', freq='D')
"""
pandas中DataFrame:
"""
# =======================DataFrame=======================
# 一、创建DataFrame
# 1】、创建DataFrame方法1:
df1 = pd.DataFrame(data=np.arange(24).reshape(6,4),index=indexs,columns=['A','B','C','D'])
print(df1)
# A B C D
# 2021-04-01 0 1 2 3
# 2021-04-02 4 5 6 7
# 2021-04-03 8 9 10 11
# 2021-04-04 12 13 14 15
# 2021-04-05 16 17 18 19
# 2021-04-06 20 21 22 23
# 2】、创建DataFrame方法2:
df2 = pd.DataFrame(np.arange(24).reshape(6,4))
print(df2)
# 0 1 2 3
# 0 0 1 2 3
# 1 4 5 6 7
# 2 8 9 10 11
# 3 12 13 14 15
# 4 16 17 18 19
# 5 20 21 22 23
# 3】、创建DataFrame方法3:
df3 = pd.read_excel(excel_path)
print(df3)
# tttt id name ... 累计值 etl时间 是否是测试数据
# 0 111 2663 chw6523.41027278453 ... 2 2021-04-23 18:48:30 0
# 1 111 5174 chw8601.19119286537 ... 2 2021-04-23 18:48:30 0
# 2 111 7482 chw2497.0723176375 ... 2 2021-04-23 18:48:30 0
# 3 111 6987 lkp3488.16597834229 ... 2 2021-04-23 18:48:30 0
# 4 111 2941 lkp8045.83088494837 ... 2 2021-04-23 18:48:30 0
# [4 rows x 11 columns]
# 4】、创建DataFrame方法4:
df4 = pd.DataFrame({
'A':1,
'B':pd.Timestamp(20210424),
'C':np.array([3,4,5,6],dtype=int),
'D':["z","y","x","p"],
'E':1,
})
print(df4)
# A B C D E
# 0 1 1970-01-01 00:00:00.020210424 3 z 1
# 1 1 1970-01-01 00:00:00.020210424 4 y 1
# 2 1 1970-01-01 00:00:00.020210424 5 x 1
# 3 1 1970-01-01 00:00:00.020210424 6 p 1
# 二、DataFrame的属性
# 打印原来的DateFrame
print(df1)
# A B C D
# 2021-04-01 0 1 2 3
# 2021-04-02 4 5 6 7
# 2021-04-03 8 9 10 11
# 2021-04-04 12 13 14 15
# 2021-04-05 16 17 18 19
# 2021-04-06 20 21 22 23
# 将B列的值*10
df1.B = df1.B.apply(lambda value:value*10)
print(df1)
# A B C D
# 2021-04-01 0 10 2 3
# 2021-04-02 4 50 6 7
# 2021-04-03 8 90 10 11
# 2021-04-04 12 130 14 15
# 2021-04-05 16 170 18 19
# 2021-04-06 20 210 22 23
# 2.1 查看索引index
print(df1.index)
# DatetimeIndex(['2021-04-01', '2021-04-02', '2021-04-03', '2021-04-04',
# '2021-04-05', '2021-04-06'],
# dtype='datetime64[ns]', freq='D')
# 2.2 查看列名columns
print(df1.columns)
# Index(['A', 'B', 'C', 'D'], dtype='object')
# 2.3 查看具体的值
print(df1.values)
# [[ 0 10 2 3]
# [ 4 50 6 7]
# [ 8 90 10 11]
# [ 12 130 14 15]
# [ 16 170 18 19]
# [ 20 210 22 23]]
# 2.4 DataFrame的描述,只能针对类型是数字类型的列
print(df1.describe())
# A B C D
# count 6.000000 6.000000 6.000000 6.000000
# mean 10.000000 110.000000 12.000000 13.000000
# std 7.483315 74.833148 7.483315 7.483315
# min 0.000000 10.000000 2.000000 3.000000
# 25% 5.000000 60.000000 7.000000 8.000000
# 50% 10.000000 110.000000 12.000000 13.000000
# 75% 15.000000 160.000000 17.000000 18.000000
# max 20.000000 210.000000 22.000000 23.000000
# 2.5 排序
# 2.5.1 遍历行,按照列名 倒序(ascending=False)排序
print(df1.sort_index(axis=1,ascending=False))
# D C B A
# 2021-04-01 3 2 10 0
# 2021-04-02 7 6 50 4
# 2021-04-03 11 10 90 8
# 2021-04-04 15 14 130 12
# 2021-04-05 19 18 170 16
# 2021-04-06 23 22 210 20
# 列明倒序了
# 2.5.2 遍历列,按照索引名 倒序(ascending=False)排序
print(df1.sort_index(axis=0,ascending=False))
# A B C D
# 2021-04-06 20 210 22 23
# 2021-04-05 16 170 18 19
# 2021-04-04 12 130 14 15
# 2021-04-03 8 90 10 11
# 2021-04-02 4 50 6 7
# 2021-04-01 0 10 2 3
# 索引倒序了
# 2.5.3 遍历行,按照列值 倒序(ascending=False)排序
print(df1.sort_values(by='2021-04-04',axis=1,ascending=False))
# B D C A
# 2021-04-01 10 3 2 0
# 2021-04-02 50 7 6 4
# 2021-04-03 90 11 10 8
# 2021-04-04 130 15 14 12
# 2021-04-05 170 19 18 16
# 2021-04-06 210 23 22 20
# 三、DataFrame 选择数据
print(df1)
# A B C D
# 2021-04-01 0 10 2 3
# 2021-04-02 4 50 6 7
# 2021-04-03 8 90 10 11
# 2021-04-04 12 130 14 15
# 2021-04-05 16 170 18 19
# 2021-04-06 20 210 22 23
# 打印A列: df1['A']和df1.A用法一样的
print(df1['A'])
print(df1.A)
# 2021-04-01 0
# 2021-04-02 4
# 2021-04-03 8
# 2021-04-04 12
# 2021-04-05 16
# 2021-04-06 20
# Freq: D, Name: A, dtype: int64
# 打印下标为0、1、2行;# 并且打印D列
print(df1[0:3],df1.D)
# A B C D
# 2021-04-01 0 10 2 3
# 2021-04-02 4 50 6 7
# 2021-04-03 8 90 10 11
# 2021-04-01 3
# 2021-04-02 7
# 2021-04-03 11
# 2021-04-04 15
# 2021-04-05 19
# 2021-04-06 23
# Freq: D, Name: D, dtype: int64
print(df1['C']>20)
# 2021-04-01 False
# 2021-04-02 False
# 2021-04-03 False
# 2021-04-04 False
# 2021-04-05 False
# 2021-04-06 True
# Freq: D, Name: C, dtype: bool
print(df1[(df1.C>20)])
# A B C D
# 2021-04-06 20 210 22 23
print(df1[(df1.C>2)].C.isin([22]))
# 2021-04-02 False
# 2021-04-03 False
# 2021-04-04 False
# 2021-04-05 False
# 2021-04-06 True
# Freq: D, Name: C, dtype: bool
isin_df = df1[ df1.C.isin([22]) ]
print(isin_df)
# A B C D
# 2021-04-06 20 210 22 23
pandas数据处理
###############################################################
#pandas数据处理
###############################################################
l = ['转介绍渠道', '被介绍人订单号', '被介绍人学员ID', '被介绍人-赠送课时订单号', '被介绍人-赠送课时包ID(赠送课程)',
'介绍人学员ID', '介绍人-赠送课时订单号', '介绍人-赠送课时包ID(赠送课程)', '转介绍赠送课程-套餐名', '赠送时间',
'赠送课时数', '课程类型', '消耗年份', '消耗月份', '消耗课时', '剩余课时', '2020-02消耗',
'2020-03消耗', '2020-04消耗', '2020-05消耗', '2020-08消耗', '2021-04消耗',
'2020消耗', '2021消耗']
i = l.__contains__('2020-03消耗')
# print(i)
import os
import numpy as np
import pandas as pd
import datetime
out_path_excel = os.path.dirname(os.path.dirname(__file__)) + f'/data/test{datetime.date.today()}.xlsx'
arr = np.arange(50)
# print(arr)
df = pd.DataFrame(arr.reshape(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
def style_color(df, colors):
"""
:param df: pd.DataFrame
:param colors: 字典 内容是 {标题:颜色}
:return:
"""
return df.style.apply(style_apply, colors=colors)
def style_apply(series, colors, back_ground=''):
"""
:param series: 传过来的数据是DataFramt中的一列 类型为pd.Series
:param colors: 内容是字典 其中key 为标题名 value 为颜色
:param back_ground: 北京颜色
:return:
"""
series_name = series.name[0]
a = list()
# 为了给每一个单元格上色
for col in series:
# 其中 col 为pd.DataFrame 中的 一个小单元格 大家可以根据不同需求为单元格设置不同的颜色
# 获取什么一级标题获取什么颜色
if series_name in colors:
for title_name in colors:
if title_name == series_name:
back_ground = 'background-color: ' + colors[title_name]
# '; border-left-color: #080808'
a.append(back_ground)
return a
# style_df = style_color(df, {"a": '#1C1C1C', "b": '#00EEEE', "c": '#1A1A1A'})
# df = df.style.apply(lambda col: {"a": 'background-color:#1C1C1C', "b": 'background-color:#1C1C1C', "c": 'background-color:#1C1C1C'},axis=0)
# df = df.style.apply(style_df,colors = {"a": '#1C1C1C', "b": '#00EEEE', "c": '#1A1A1A'})
# print(style_df.to_excel(out_path_excel,sheet_name='sheet1',index=False))
def test():
df = pd.DataFrame({
'A': [1, 2, 3, 4, 5, 6, 7],
'B': [0, 0, 3, 4, 5, 6, 7],
'C1': [0, 0, 0, 4, 5, 6, 7],
'C2': [11, 21, 31, 41, 15, 16, 17],
'C3': [21, 2, 3, 4, 5, 6, 7],
'D': [31, 2, 3, 4, 5, 6, 7],
})
melt_df = pd.melt(df, id_vars=['A', 'B', 'D'], value_vars=['C1', 'C2', 'C3'], var_name=['C的类型'], value_name='C的值')
print(melt_df)
# 只要有一列值为0就取出来
res = melt_df.loc[(melt_df == 0).any(axis=1), :]
# 分组求和
res = res.groupby(['A', 'C的类型']).agg('sum')
print(type(res.index), res.index)
res_index = res.index
# res['A'] = res_index.get_level_values('A')
# res['C的类型'] = res_index.get_level_values('C的类型')
# print(res)
df2 = pd.DataFrame(np.zeros(40).reshape(10, 4), columns=['A', 'C', 'E', 'F'], dtype=object)
df2.index.rename('a', True)
# print(df2)
res = pd.merge(res, df2, how='left', left_on=['A', 'C的类型'], right_on=['A', 'C'])
# 填充所有列中空值为0
res.fillna(0, inplace=True)
# 过滤部分列值为0的数据 并且A的值大于1
res = res[(res['B'] == 0) & (res["A"] > 1)]
res = pd.merge(res, df2, how='left', left_on='A', right_on='A')
print(res)
test()