Python数据分析复习整理(Pandas)

Pandas

1.Series

Series是一种一维数组

import pandas as pd
# 通过列表创建Series
list1 = [1,-2,3,-4]
obj = pd.Series(list1)
obj
0    1
1   -2
2    3
3   -4
dtype: int64
# 创建Series时指定索引
index = ["a","b","c","d"]
values = [2,5,7,8]
obj = pd.Series(values,index=index,name='指定索引')
obj
a    2
b    5
c    7
d    8
Name: 指定索引, dtype: int64
# 通过字典创建Series
dict1 = {'张三':2000,'李四':800,'王五':30,'刘六':500}
obj = pd.Series(dict1)
obj
张三    2000
李四     800
王五      30
刘六     500
dtype: int64
# 通过字典创建Series
data = {'张三':2000,'李四':800,'王五':30,'刘六':500}
index = ['张三','李四','王五','刘六','张三一']
# 索引不匹配时自动nan补全
obj = pd.Series(dict1,index=index)
obj
张三     2000.0
李四      800.0
王五       30.0
刘六      500.0
张三一       NaN
dtype: float64
values = [2,5,7,8]
obj = pd.Series(values)
# 修改索引
obj.index = ["a","b","c","d"]
obj
a    2
b    5
c    7
d    8
dtype: int64

DataFrame

# 创建DataFrame
data = {
    'A':[2,3,5,2,3],
    'B':[5,0,2,3,6],
    'C':[8,7,50,8,2],
    'key':[3,4,5,2,2]
}
df = pd.DataFrame(data)
df
ABCkey
02583
13074
252505
32382
43622
# 进行列名排序
# columns函数指定列名
columns = ['C','B','A','key']
df = pd.DataFrame(data,columns=columns)
df
CBAkey
08523
17034
250255
38322
42632
df
CBAkey
08523
17034
250255
38322
42632
# 索引
# DataFrame的index不可修改
df.index
RangeIndex(start=0, stop=5, step=1)
# 列名
df.columns
Index(['C', 'B', 'A', 'key'], dtype='object')
df
CBAkey
08523
17034
250255
38322
42632
# 所有值
df.values
array([[ 8,  5,  2,  3],
       [ 7,  0,  3,  4],
       [50,  2,  5,  5],
       [ 8,  3,  2,  2],
       [ 2,  6,  3,  2]], dtype=int64)
# 所有列
df.columns
Index(['C', 'B', 'A', 'key'], dtype='object')
# 元素个数
df.size
20
# 元素的维度
df.ndim
2
# 元素的形状
df.shape
(5, 4)
data = pd.read_csv(r'D:\Desktop\大三第二学期\Python实训\数据\members.csv')
data
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
0BL6099033097969709934黄金会员2019-03-30 17:16:17.141DPJ0091988-12-2030.0活跃
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
2BL6099033099933500943白银会员2019-03-30 11:13:35.469DPJ0091990-09-0528.0沉睡
3BL6099036369935609966白银会员2019-03-28 21:13:52.917DPJ0091971-07-2347.0沉睡
4BL6099036393309900969黄金会员2019-03-28 18:30:19.672DPJ0091990-09-1528.0活跃
5BL6099036395563900606白银会员2019-03-28 15:52:31.583DPJ0091980-06-0438.0沉睡
6BL6099090903509300066白银会员2019-01-01 08:50:13.267DPJ0081969-12-1949.0沉睡
7BL6093906999066500093白银会员2018-10-29 11:06:25.902DPJ0082000-02-2719.0沉睡
8BL6093906395604900473白银会员2018-10-28 15:20:49.183DPJ0081969-10-2649.0沉睡
9BL6093906793643300676白银会员2018-10-27 13:24:38.805DPJ0081989-11-0429.0沉睡
# 选取一列
w1 = data['会员等级']
w1
0    黄金会员
1    白银会员
2    白银会员
3    白银会员
4    黄金会员
5    白银会员
6    白银会员
7    白银会员
8    白银会员
9    白银会员
Name: 会员等级, dtype: object
# 选取多列
w1 = data[['会员等级','性别']]
w1
会员等级性别
0黄金会员
1白银会员
2白银会员
3白银会员
4黄金会员
5白银会员
6白银会员
7白银会员
8白银会员
9白银会员
# 选取一行
data[1:2]
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
# 选取多行
data[1:5]
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
2BL6099033099933500943白银会员2019-03-30 11:13:35.469DPJ0091990-09-0528.0沉睡
3BL6099036369935609966白银会员2019-03-28 21:13:52.917DPJ0091971-07-2347.0沉睡
4BL6099036393309900969黄金会员2019-03-28 18:30:19.672DPJ0091990-09-1528.0活跃
# 选取会员等级和性别两列
data.loc[:,['会员等级','性别']]
会员等级性别
0黄金会员
1白银会员
2白银会员
3白银会员
4黄金会员
5白银会员
6白银会员
7白银会员
8白银会员
9白银会员
# 选取第1行和第3行的会员等级和性别两列
data.loc[[1,3],['会员等级','性别']]
会员等级性别
1白银会员
3白银会员
# 选取第1行和第3行
data.loc[[1,3],:]
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
3BL6099036369935609966白银会员2019-03-28 21:13:52.917DPJ0091971-07-2347.0沉睡
# 选取第1行和第3行
# 前闭后开区间
data.iloc[1:3,:]
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
2BL6099033099933500943白银会员2019-03-30 11:13:35.469DPJ0091990-09-0528.0沉睡
# 选取第1行和第3行
# 前闭后闭区间
data.loc[1:3,:]
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
2BL6099033099933500943白银会员2019-03-30 11:13:35.469DPJ0091990-09-0528.0沉睡
3BL6099036369935609966白银会员2019-03-28 21:13:52.917DPJ0091971-07-2347.0沉睡
# 筛选女生
data[data['性别']=='女']
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
2BL6099033099933500943白银会员2019-03-30 11:13:35.469DPJ0091990-09-0528.0沉睡
3BL6099036369935609966白银会员2019-03-28 21:13:52.917DPJ0091971-07-2347.0沉睡
4BL6099036393309900969黄金会员2019-03-28 18:30:19.672DPJ0091990-09-1528.0活跃
7BL6093906999066500093白银会员2018-10-29 11:06:25.902DPJ0082000-02-2719.0沉睡
9BL6093906793643300676白银会员2018-10-27 13:24:38.805DPJ0081989-11-0429.0沉睡
data
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
0BL6099033097969709934黄金会员2019-03-30 17:16:17.141DPJ0091988-12-2030.0活跃
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
2BL6099033099933500943白银会员2019-03-30 11:13:35.469DPJ0091990-09-0528.0沉睡
3BL6099036369935609966白银会员2019-03-28 21:13:52.917DPJ0091971-07-2347.0沉睡
4BL6099036393309900969黄金会员2019-03-28 18:30:19.672DPJ0091990-09-1528.0活跃
5BL6099036395563900606白银会员2019-03-28 15:52:31.583DPJ0091980-06-0438.0沉睡
6BL6099090903509300066白银会员2019-01-01 08:50:13.267DPJ0081969-12-1949.0沉睡
7BL6093906999066500093白银会员2018-10-29 11:06:25.902DPJ0082000-02-2719.0沉睡
8BL6093906395604900473白银会员2018-10-28 15:20:49.183DPJ0081969-10-2649.0沉睡
9BL6093906793643300676白银会员2018-10-27 13:24:38.805DPJ0081989-11-0429.0沉睡
data = data.loc[:,['所属店铺编码','性别','生日','年齡']]
data
所属店铺编码性别生日年齡
0DPJ0091988-12-2030.0
1DPJ0091995-05-2023.0
2DPJ0091990-09-0528.0
3DPJ0091971-07-2347.0
4DPJ0091990-09-1528.0
5DPJ0091980-06-0438.0
6DPJ0081969-12-1949.0
7DPJ0082000-02-2719.0
8DPJ0081969-10-2649.0
9DPJ0081989-11-0429.0
# 增加一行数据
data_add = {'所属店铺编码':'DPJ000','性别':'男','生日':'2000-11-11','年齡':'20.0'}
data = data.append(data_add,ignore_index=True)
所属店铺编码性别生日年齡
0DPJ0091988-12-2030
1DPJ0091995-05-2023
2DPJ0091990-09-0528
3DPJ0091971-07-2347
4DPJ0091990-09-1528
5DPJ0091980-06-0438
6DPJ0081969-12-1949
7DPJ0082000-02-2719
8DPJ0081969-10-2649
9DPJ0081989-11-0429
10DPJ0002000-11-1120.0
# 增加一列数据
data['学历'] = '本科'
data
所属店铺编码性别生日年齡学历
0DPJ0091988-12-2030.0本科
1DPJ0091995-05-2023.0本科
2DPJ0091990-09-0528.0本科
3DPJ0091971-07-2347.0本科
4DPJ0091990-09-1528.0本科
5DPJ0091980-06-0438.0本科
6DPJ0081969-12-1949.0本科
7DPJ0082000-02-2719.0本科
8DPJ0081969-10-2649.0本科
9DPJ0081989-11-0429.0本科
# 增加一列数据
data['薪资'] = [1000,200,3000,5000,10000,400,5000,6000,7555,8000]
data
所属店铺编码性别生日年齡学历薪资
0DPJ0091988-12-2030.0本科1000
1DPJ0091995-05-2023.0本科200
2DPJ0091990-09-0528.0本科3000
3DPJ0091971-07-2347.0本科5000
4DPJ0091990-09-1528.0本科10000
5DPJ0091980-06-0438.0本科400
6DPJ0081969-12-1949.0本科5000
7DPJ0082000-02-2719.0本科6000
8DPJ0081969-10-2649.0本科7555
9DPJ0081989-11-0429.0本科8000
# 删除一行
data.drop(4,axis=0)
所属店铺编码性别生日年齡学历薪资
0DPJ0091988-12-2030.0本科1000
1DPJ0091995-05-2023.0本科200
2DPJ0091990-09-0528.0本科3000
3DPJ0091971-07-2347.0本科5000
5DPJ0091980-06-0438.0本科400
6DPJ0081969-12-1949.0本科5000
7DPJ0082000-02-2719.0本科6000
8DPJ0081969-10-2649.0本科7555
9DPJ0081989-11-0429.0本科8000
# 删除一行并保存
data.drop(5,axis=0,inplace=True)
data
所属店铺编码性别生日年齡学历薪资
0DPJ0091988-12-2030.0本科1000
1DPJ0091995-05-2023.0本科200
2DPJ0091990-09-0528.0本科3000
3DPJ0091971-07-2347.0本科5000
6DPJ0081969-12-1949.0本科5000
7DPJ0082000-02-2719.0本科6000
8DPJ0081969-10-2649.0本科7555
9DPJ0081989-11-0429.0本科8000
# 删除一列并保存
data.drop('薪资',axis=1,inplace=True)
data
所属店铺编码性别生日年齡学历
0DPJ0091988-12-2030.0本科
1DPJ0091995-05-2023.0本科
2DPJ0091990-09-0528.0本科
3DPJ0091971-07-2347.0本科
6DPJ0081969-12-1949.0本科
7DPJ0082000-02-2719.0本科
8DPJ0081969-10-2649.0本科
9DPJ0081989-11-0429.0本科
# 对索引进行排序(ascending=False)为逆序
data.sort_index(ascending=False)
所属店铺编码性别生日年齡学历
9DPJ0081989-11-0429.0本科
8DPJ0081969-10-2649.0本科
7DPJ0082000-02-2719.0本科
6DPJ0081969-12-1949.0本科
3DPJ0091971-07-2347.0本科
2DPJ0091990-09-0528.0本科
1DPJ0091995-05-2023.0本科
0DPJ0091988-12-2030.0本科
# 对年齡进行排序
data.sort_values(by='年齡')
所属店铺编码性别生日年齡学历
7DPJ0082000-02-2719.0本科
1DPJ0091995-05-2023.0本科
2DPJ0091990-09-0528.0本科
9DPJ0081989-11-0429.0本科
0DPJ0091988-12-2030.0本科
3DPJ0091971-07-2347.0本科
6DPJ0081969-12-1949.0本科
8DPJ0081969-10-2649.0本科
df
CBAkey
08523
17034
250255
38322
42632
# 对每个数值型列进行统计
df.describe()
CBAkey
count5.0000005.0000005.0000005.00000
mean15.0000003.2000003.0000003.20000
std19.7230832.3874671.2247451.30384
min2.0000000.0000002.0000002.00000
25%7.0000002.0000002.0000002.00000
50%8.0000003.0000003.0000003.00000
75%8.0000005.0000003.0000004.00000
max50.0000006.0000005.0000005.00000
# 对每列进行求和
df.sum()
C      75
B      16
A      15
key    16
dtype: int64
# 对每行进行求和
df.sum(axis=1)
0    18
1    14
2    62
3    15
4    13
dtype: int64
data
所属店铺编码性别生日年齡学历
0DPJ0091988-12-2030.0本科
1DPJ0091995-05-2023.0本科
2DPJ0091990-09-0528.0本科
3DPJ0091971-07-2347.0本科
6DPJ0081969-12-1949.0本科
7DPJ0082000-02-2719.0本科
8DPJ0081969-10-2649.0本科
9DPJ0081989-11-0429.0本科
# 对所属店铺编码列进行去重
data['所属店铺编码'].unique()
array(['DPJ009', 'DPJ008'], dtype=object)
# 对每个所属店铺编码进行计次统计
data['所属店铺编码'].value_counts()
DPJ009    4
DPJ008    4
Name: 所属店铺编码, dtype: int64
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'key1':['a','b','a','b','b'],
    'key2':['yes','no','yes','yes','no'],
    'data1':np.arange(5),
    'data2':np.random.rand(5)
})
df
key1key2data1data2
0ayes00.775070
1bno10.641297
2ayes20.076019
3byes30.068691
4bno40.983585
# 按列名分组
df.groupby('key2').count()
key1data1data2
key2
no222
yes333
# 按列表或元组分析
wlist = ['w','w','y','w','w']
df.groupby(wlist).sum()
data1data2
w81.929771
y20.937298
df
key1key2data1data2
0ayes00.775070
1bno10.641297
2ayes20.076019
3byes30.068691
4bno40.983585
wdict = {0:'one',1:'one',2:'two',3:'three',4:'two'}
df.groupby(wdict).count()
key1key2data1data2
one2222
three1111
two2222
# 按函数分组
def fenzu(x):
    if x> 0.5:
        return 'a'
    else:
        return 'b'
df.groupby(df['data2'].map(fenzu)).count()
key1key2data1data2
data2
a3333
b2222
# 数据聚合(对分组的数据进行计算,产生标量值的数据传递过程)
# 计数
df.groupby(df['data2'].map(fenzu)).count()
# 求和
df.groupby(df['data2'].map(fenzu)).sum()
# 均值
df.groupby(df['data2'].map(fenzu)).mean()
# 中位数
df.groupby(df['data2'].map(fenzu)).median()
# 标准差
df.groupby(df['data2'].map(fenzu)).std()
# 方差
df.groupby(df['data2'].map(fenzu)).var()
# 最小值
df.groupby(df['data2'].map(fenzu)).min()
# 最大值
df.groupby(df['data2'].map(fenzu)).max()
# 求积
df.groupby(df['data2'].map(fenzu)).prod()
# 第一个值
df.groupby(df['data2'].map(fenzu)).first()
# 最后一个值
df.groupby(df['data2'].map(fenzu)).last()
key1key2data1data2
data2
abno40.983585
bbyes30.068691
# agg聚合数据
df[['data1','data2']].agg([np.sum,np.mean,np.min])
data1data2
sum10.02.544661
mean2.00.508932
amin0.00.068691
# agg聚合数据
df.agg({'data1':np.sum,'data2':[np.mean,np.min]})
data1data2
aminNaN0.068691
meanNaN0.508932
sum10.0NaN
# transform方法
df.groupby(['data2','data1'])['data1','data2'].transform('mean')
data1data2
330.068691
000.775070
220.076019
440.983585
110.641297
```python import pandas as pd ```
members = pd.read_csv(r'D:\Desktop\大三第二学期\Python实训\数据\members.csv')
members
会员卡号会员等级注册时间所属店铺编码性别生日年齡生命级别
0BL6099033097969709934黄金会员2019-03-30 17:16:17.141DPJ0091988-12-2030.0活跃
1BL6099033094943300594白银会员2019-03-30 14:14:38.576DPJ0091995-05-2023.0活跃
2BL6099033099933500943白银会员2019-03-30 11:13:35.469DPJ0091990-09-0528.0沉睡
3BL6099036369935609966白银会员2019-03-28 21:13:52.917DPJ0091971-07-2347.0沉睡
4BL6099036393309900969黄金会员2019-03-28 18:30:19.672DPJ0091990-09-1528.0活跃
5BL6099036395563900606白银会员2019-03-28 15:52:31.583DPJ0091980-06-0438.0沉睡
6BL6099090903509300066白银会员2019-01-01 08:50:13.267DPJ0081969-12-1949.0沉睡
7BL6093906999066500093白银会员2018-10-29 11:06:25.902DPJ0082000-02-2719.0沉睡
8BL6093906395604900473白银会员2018-10-28 15:20:49.183DPJ0081969-10-2649.0沉睡
9BL6093906793643300676白银会员2018-10-27 13:24:38.805DPJ0081989-11-0429.0沉睡
pd.crosstab(index=members['会员等级'],columns=members['性别'])
性别
会员等级
白银会员53
黄金会员11
pd.crosstab(index=members['会员等级'],columns=members['性别'],values=members['年齡'],aggfunc='mean')
性别
会员等级
白银会员29.245.333333
黄金会员28.030.000000

透视表

# 查看各等级会员的平均年龄
members.pivot_table(values='年齡',index='会员等级')
年齡
会员等级
白银会员35.25
黄金会员29.00
# 查看各等级每一个性别的平均年龄
members.pivot_table(values='年齡',index='会员等级',columns='性别')
性别
会员等级
白银会员29.245.333333
黄金会员28.030.000000
# 查看各店铺,各等级每一性别的平均年龄
members.pivot_table(values='年齡',index=['所属店铺编码','会员等级'],columns='性别')
性别
所属店铺编码会员等级
DPJ008白银会员24.00000049.0
DPJ009白银会员32.66666738.0
黄金会员28.00000030.0
import pandas as pd
# 1.read_excel
orders = pd.read_excel(r'D:\Desktop\大三第二学期\Python实训\数据\会员消费报表.xlsx')
stores = pd.read_excel(r'D:\Desktop\大三第二学期\Python实训\数据\门店信息表.xlsx')
# 1.read_csv
orders = pd.read_csv(r'D:\Desktop\大三第二学期\Python实训\数据\会员消费报表.csv')
stores = pd.read_csv(r'D:\Desktop\大三第二学期\Python实训\数据\门店信息表.csv',usecols=['店铺代码', '地区编码'])
orders.head()
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分
0HS340766JAF62018-11-30ODLOX6BXX8X2BXBBBBX下单DPX60XBLA267Q3X13AQM23011200.0800
1BL60930399994656035902018-11-30ODODOXF77X8X2BXBBBB2下单DPX377BLA2651QX14AQC24011332.02531
2BL60939095939396004072018-11-30ROX8XXFBBBB6BB退单DPS00XTMA27727X5QAQM240-1-112.5328
3BL60930360960307093942018-11-30ROX8XXFBBBB6X7退单DPS00XTMA67621X5QBQTM230-1-60.01038
4BL60939930669437006502018-11-30ODLOX6BFX8XXFBBBBBX下单DPX603BLA26663X52AQTM23511200.0800
stores.head()
店铺代码地区编码
0DPX00XGBL6020
1DPX002GBL6020
2DPX003GBL6020
3DPX004GBL6020
4DPX005GBL6020
data = stores[0:10]
data
店铺代码地区编码
0DPX00XGBL6020
1DPX002GBL6020
2DPX003GBL6020
3DPX004GBL6020
4DPX005GBL6020
5DPX006GBL6020
6DPX007GBL6020
7DPX008GBL6020
8DPX0X0GBL6020
9DPX0X2GBL6020
# 导出为csv文件
data.to_csv('test.csv')
# 导出为Excel文件
data.to_excel('test.xlsx')
# merge函数
pd.merge(orders,stores,left_on='店铺代码',right_on='店铺代码')
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
0HS340766JAF62018-11-30ODLOX6BXX8X2BXBBBBX下单DPX60XBLA267Q3X13AQM23011200.0800GBL6140
1BL60939964696657090642018-11-28ODROX6BXX8XX28BBBB2退单DPX60XBLA26725X13AQTM235-1-1200.0100GBL6140
2BL60939964696657090642018-11-28ODLOX6BXX8XX28BBBBX下单DPX60XBLA26725X13AQTM23511200.0900GBL6140
3BL60939964696657090642018-11-27ODROX6BXX8XX29BBBBX退单DPX60XBLA26725X13AQTM235-1-1200.0900GBL6140
4BL60939964696657090642018-11-27ODLOX6BXX8XX28BBBB2下单DPX60XBLA26725X13AQTM23511200.01700GBL6140
....................................
203432BL60930360605495033962019-01-12ODLOX9B2X9BXX2BBBBX下单DPX902BLA26516X1QAQG23511948.51633GBL6070
203433BL60990909699494005652019-01-09ODLOX892X9BXB9BBBBX下单DPX892BLA265Q4X11TMQQ23011153.5869GBL6130
203434BL60930567695760033662019-01-08ODODOX2B2X9BXB8BBBBX下单DPX202BLAQ2112X6QBQQ21511198.5899GBL6090
203435BL60939606940356009462019-01-01ODLOX9BFX9BXBXBBBBX下单DPX903TMB265L5X62N2123011198.51000GBL6140
203436BL60939606940356009462019-01-01ODLOX9BFX9BXBXBBBBX下单DPX903BLA27714X2QBQG23011500.01000GBL6140

203437 rows × 11 columns

# merge函数,左连接
pd.merge(orders,stores,how='left')
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
0HS340766JAF62018-11-30ODLOX6BXX8X2BXBBBBX下单DPX60XBLA267Q3X13AQM23011200.0800GBL6140
1BL60930399994656035902018-11-30ODODOXF77X8X2BXBBBB2下单DPX377BLA2651QX14AQC24011332.02531GBL6070
2BL60939095939396004072018-11-30ROX8XXFBBBB6BB退单DPS00XTMA27727X5QAQM240-1-112.5328GBL6D01
3BL60930360960307093942018-11-30ROX8XXFBBBB6X7退单DPS00XTMA67621X5QBQTM230-1-60.01038GBL6D01
4BL60939930669437006502018-11-30ODLOX6BFX8XXFBBBBBX下单DPX603BLA26663X52AQTM23511200.0800GBL6140
....................................
203432BL60939695940699009552019-01-01ODROX68BX9BXBXBBBBX退单DPX680BL44P7QXQ1XXA230-1-1.50GBL6110
203433BL60939695940699009552019-01-01ODROX68BX9BXBXBBBBX退单DPX680BLA26766X5QC3TM235-1-1707.00GBL6110
203434BL60990909909959000302019-01-01ODLOX68BX9BXBXBBBBF下单DPX680BLA26663X56BQTM23511011.0337GBL6110
203435BL60990909095903000632019-01-01ODLOX22XX9BXBXBBBBF下单DPX22XBLA27522X16AQTM23511032.0344GBL6050
203436BL60990909093437000652019-01-01ODLOX225X9BXBXBBBBX下单DPX225BLA26517X11CQM2351598.50GBL6050

203437 rows × 11 columns

# merge函数,右链接
pd.merge(orders,stores,how='right')
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
0HS340766JAF62018-11-30ODLOX6BXX8X2BXBBBBX下单DPX60XBLA267Q3X13AQM230.01.01200.0800.0GBL6140
1BL60939964696657090642018-11-28ODROX6BXX8XX28BBBB2退单DPX60XBLA26725X13AQTM235.0-1.0-1200.0100.0GBL6140
2BL60939964696657090642018-11-28ODLOX6BXX8XX28BBBBX下单DPX60XBLA26725X13AQTM235.01.01200.0900.0GBL6140
3BL60939964696657090642018-11-27ODROX6BXX8XX29BBBBX退单DPX60XBLA26725X13AQTM235.0-1.0-1200.0900.0GBL6140
4BL60939964696657090642018-11-27ODLOX6BXX8XX28BBBB2下单DPX60XBLA26725X13AQTM235.01.01200.01700.0GBL6140
....................................
203580NaNNaNNaNNaNDPJ030NaNNaNNaNNaNNaNGBL6080
203581NaNNaNNaNNaNDPJ033NaNNaNNaNNaNNaNGBL6110
203582NaNNaNNaNNaNDPJ034NaNNaNNaNNaNNaNGBL6010
203583NaNNaNNaNNaNDPJ036NaNNaNNaNNaNNaNGBL6120
203584NaNNaNNaNNaNDPJ04XNaNNaNNaNNaNNaNGBL6070

203585 rows × 11 columns

# concat数据连接
# 连接两个Series
s1 = pd.Series([0,1],index=['a','b'])
s2 = pd.Series([0,1,3],index=['e','d','c'])
s3 = pd.Series([0,2],index=['a','e'])
pd.concat([s1,s2,s3])
a    0
b    1
e    0
d    1
c    3
a    0
e    2
dtype: int64
# 连接两个DataFrame
data1 = orders[0:10]
data2 = stores[0:5]
data = pd.concat([data1,data2])
data
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
0HS340766JAF62018-11-30ODLOX6BXX8X2BXBBBBX下单DPX60XBLA267Q3X13AQM230.01.01200.0800.0NaN
1BL60930399994656035902018-11-30ODODOXF77X8X2BXBBBB2下单DPX377BLA2651QX14AQC240.01.01332.02531.0NaN
2BL60939095939396004072018-11-30ROX8XXFBBBB6BB退单DPS00XTMA27727X5QAQM240.0-1.0-112.5328.0NaN
3BL60930360960307093942018-11-30ROX8XXFBBBB6X7退单DPS00XTMA67621X5QBQTM230.0-1.0-60.01038.0NaN
4BL60939930669437006502018-11-30ODLOX6BFX8XXFBBBBBX下单DPX603BLA26663X52AQTM235.01.01200.0800.0NaN
5BL60939930660639006392018-11-30ODLOX78MX8XXFBBBBB6下单DPX784BLA26515X1QBQQ230.01.0598.5899.0NaN
6BL60939930660639006392018-11-30ODLOX78MX8XXFBBBBB6下单DPX784BLA26524X13AQS225.01.0898.5899.0NaN
7BL60939930660009006352018-11-30ODLOX78MX8XXFBBBBB5下单DPX784BLA265Q3X13AQM235.01.01050.0700.0NaN
8BL60930499603999097032018-11-30ODLOXM8BX8XXFBBBBB6下单DPX480BLA67712X15BQTM235.01.0598.5930.0NaN
9BL60930499603999097032018-11-30ODLOXM8BX8XXFBBBBB6下单DPX480BLA26663X5QAQTM235.01.01050.0930.0NaN
0NaNNaNNaNNaNDPX00XNaNNaNNaNNaNNaNGBL6020
1NaNNaNNaNNaNDPX002NaNNaNNaNNaNNaNGBL6020
2NaNNaNNaNNaNDPX003NaNNaNNaNNaNNaNGBL6020
3NaNNaNNaNNaNDPX004NaNNaNNaNNaNNaNGBL6020
4NaNNaNNaNNaNDPX005NaNNaNNaNNaNNaNGBL6020
# 检测缺失值
data.isnull()
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
0FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
1FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
3FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
5FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
6FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
7FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
8FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
9FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
0TrueTrueTrueTrueFalseTrueTrueTrueTrueTrueFalse
1TrueTrueTrueTrueFalseTrueTrueTrueTrueTrueFalse
2TrueTrueTrueTrueFalseTrueTrueTrueTrueTrueFalse
3TrueTrueTrueTrueFalseTrueTrueTrueTrueTrueFalse
4TrueTrueTrueTrueFalseTrueTrueTrueTrueTrueFalse
# 缺失值统计
data.isnull().sum()
卡号       5
订单日期     5
订单号      5
订单类型     5
店铺代码     0
款号       5
尺码       5
消费数量     5
消费金额     5
当前积分     5
地区编码    10
dtype: int64
# 检测非缺失值
data.notnull()
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
0TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
1TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
2TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
3TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
4TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
5TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
6TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
7TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
8TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
9TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueFalse
0FalseFalseFalseFalseTrueFalseFalseFalseFalseFalseTrue
1FalseFalseFalseFalseTrueFalseFalseFalseFalseFalseTrue
2FalseFalseFalseFalseTrueFalseFalseFalseFalseFalseTrue
3FalseFalseFalseFalseTrueFalseFalseFalseFalseFalseTrue
4FalseFalseFalseFalseTrueFalseFalseFalseFalseFalseTrue
# 删除缺失值
data1 = data.dropna()
data1
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
# 用fillna填充缺失值
data2 = data.fillna(0)
data2
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
0HS340766JAF62018-11-30ODLOX6BXX8X2BXBBBBX下单DPX60XBLA267Q3X13AQM230.01.01200.0800.00
1BL60930399994656035902018-11-30ODODOXF77X8X2BXBBBB2下单DPX377BLA2651QX14AQC240.01.01332.02531.00
2BL60939095939396004072018-11-30ROX8XXFBBBB6BB退单DPS00XTMA27727X5QAQM240.0-1.0-112.5328.00
3BL60930360960307093942018-11-30ROX8XXFBBBB6X7退单DPS00XTMA67621X5QBQTM230.0-1.0-60.01038.00
4BL60939930669437006502018-11-30ODLOX6BFX8XXFBBBBBX下单DPX603BLA26663X52AQTM235.01.01200.0800.00
5BL60939930660639006392018-11-30ODLOX78MX8XXFBBBBB6下单DPX784BLA26515X1QBQQ230.01.0598.5899.00
6BL60939930660639006392018-11-30ODLOX78MX8XXFBBBBB6下单DPX784BLA26524X13AQS225.01.0898.5899.00
7BL60939930660009006352018-11-30ODLOX78MX8XXFBBBBB5下单DPX784BLA265Q3X13AQM235.01.01050.0700.00
8BL60930499603999097032018-11-30ODLOXM8BX8XXFBBBBB6下单DPX480BLA67712X15BQTM235.01.0598.5930.00
9BL60930499603999097032018-11-30ODLOXM8BX8XXFBBBBB6下单DPX480BLA26663X5QAQTM235.01.01050.0930.00
00000DPX00X00.00.00.00.0GBL6020
10000DPX00200.00.00.00.0GBL6020
20000DPX00300.00.00.00.0GBL6020
30000DPX00400.00.00.00.0GBL6020
40000DPX00500.00.00.00.0GBL6020
# 用duplicates删除卡号
data2.drop_duplicates()
卡号订单日期订单号订单类型店铺代码款号尺码消费数量消费金额当前积分地区编码
0HS340766JAF62018-11-30ODLOX6BXX8X2BXBBBBX下单DPX60XBLA267Q3X13AQM230.01.01200.0800.00
1BL60930399994656035902018-11-30ODODOXF77X8X2BXBBBB2下单DPX377BLA2651QX14AQC240.01.01332.02531.00
2BL60939095939396004072018-11-30ROX8XXFBBBB6BB退单DPS00XTMA27727X5QAQM240.0-1.0-112.5328.00
3BL60930360960307093942018-11-30ROX8XXFBBBB6X7退单DPS00XTMA67621X5QBQTM230.0-1.0-60.01038.00
4BL60939930669437006502018-11-30ODLOX6BFX8XXFBBBBBX下单DPX603BLA26663X52AQTM235.01.01200.0800.00
5BL60939930660639006392018-11-30ODLOX78MX8XXFBBBBB6下单DPX784BLA26515X1QBQQ230.01.0598.5899.00
6BL60939930660639006392018-11-30ODLOX78MX8XXFBBBBB6下单DPX784BLA26524X13AQS225.01.0898.5899.00
7BL60939930660009006352018-11-30ODLOX78MX8XXFBBBBB5下单DPX784BLA265Q3X13AQM235.01.01050.0700.00
8BL60930499603999097032018-11-30ODLOXM8BX8XXFBBBBB6下单DPX480BLA67712X15BQTM235.01.0598.5930.00
9BL60930499603999097032018-11-30ODLOXM8BX8XXFBBBBB6下单DPX480BLA26663X5QAQTM235.01.01050.0930.00
00000DPX00X00.00.00.00.0GBL6020
10000DPX00200.00.00.00.0GBL6020
20000DPX00300.00.00.00.0GBL6020
30000DPX00400.00.00.00.0GBL6020
40000DPX00500.00.00.00.0GBL6020
# 用duplicates进行去重
data2['卡号'].drop_duplicates()
0             HS340766JAF6
1    BL6093039999465603590
2    BL6093909593939600407
3    BL6093036096030709394
4    BL6093993066943700650
5    BL6093993066063900639
7    BL6093993066000900635
8    BL6093049960399909703
0                        0
Name: 卡号, dtype: object
  • 1
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值