kelly 0.969361
anne 0.138287
t-c 0.252173
dtype: float64
==========
anne 0.728153
kelly 0.591631
lily 0.017278
dtype: float64
==========
anne 0.866441
kelly 1.560993
lily NaN
t-c NaN
dtype: float64
4.删除和添加
删除
s = pd.Series(np.random.rand(5),index=list('abcde'))print(s)
s1 = s.drop('a')# 返回删除后的值,原值不改变,默认inplace=Falseprint(s1)print(s)
a 0.573110
b 0.110300
c 0.834439
d 0.608614
e 0.796221
dtype: float64
b 0.110300
c 0.834439
d 0.608614
e 0.796221
dtype: float64
a 0.573110
b 0.110300
c 0.834439
d 0.608614
e 0.796221
dtype: float64
data =[1,2,3,4,5]
df = pd.DataFrame(data)print(df)
0
0 1
1 2
2 3
3 4
4 5
df = pd.Series(data)print(df)
0 1
1 2
2 3
3 4
4 5
dtype: int64
#观察可以发现,dataframe多了一个列标签
2.使用嵌套列表创建
#列表中每一个元素代表一行数据
data =[['xiaowang',20],['lily',30],['anne',40]]#未分配列标签
df = pd.DataFrame(data)print(df)#观察可以发现有行标签和列标签
0 1
0 xiaowang 20
1 lily 30
2 anne 40
#列表中每一个元素代表一行数据
data =[['xiaowang',20],['lily',30],['anne',40]]#分配列标签
df = pd.DataFrame(data,columns=['name','age'])print(df)#观察可以发现有行标签和列标签
name age
0 xiaowang 20
1 lily 30
2 anne 40
3.列表嵌套字典创建DataFrame 对象
默认情况下,字典的键被用作列名
data =[{'a':1,'b':2},{'a':5,'b':10,'c':20}]
df = pd.DataFrame(data,index=['first','second'])print(df)
a b c
first 1 2 NaN
second 5 10 20.0
data =[{'a':1,'b':2},{'a':5,'b':10,'c':20}]
df1 = pd.DataFrame(data,index=['first','second'],columns=['a','b'])#注意:因为b1 在字典键中不存在,所以对应值为NaN
df2 = pd.DataFrame(data,index=['first','second'],columns=['a','b1'])print(df1)print('='*20)print(df2)
a b
first 1 2
second 5 10
====================
a b1
first 1 NaN
second 5 NaN
4. 字典嵌套列表创建
data ={'name':['关羽','刘备','张飞','曹操'],'age':[28,34,29,42]}#通过字典创建DataFrame
df = pd.DataFrame(data)print(df)#输入标签print(df.index)print(df.columns)#注意这里使用了默认行标签,也就是RangeIndex
name age
0 关羽 28
1 刘备 34
2 张飞 29
3 曹操 42
RangeIndex(start=0, stop=4, step=1)
Index(['name', 'age'], dtype='object')
5.添加自定义的行标签
data ={'name':['关羽','刘备','张飞','曹操'],'age':[28,34,29,42]}#通过字典创建DataFrame
index =['r1','r2','r3','r4']
df = pd.DataFrame(data,index=index)print(df)
d ={'one':pd.Series([1,2,3],index=list('abc')),'two':pd.Series([1,2,3,4],index=list('abcd'))}
df = pd.DataFrame(d)print(df)
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
7.解决不同列设置自定义数据类型
#创建数据
data ={'name':pd.Series(['xiaowang','lily','anne']),'age':pd.Series([20,30,40],dtype=float),'gender':pd.Series(['男','男','女']),'salary':pd.Series([5000,8000,10000],dtype=float)}
df = pd.DataFrame(data)print(df)
name age gender salary
0 xiaowang 20.0 男 5000.0
1 lily 30.0 男 8000.0
2 anne 40.0 女 10000.0
列操作DataFrame
DataFrame 可以使用列标签来完成数据的选取,添加和删除操作。
1.选取列数据
可以使用索引
data ={'name':['关羽','刘备','张飞','曹操'],'age':[28,34,29,42]}#通过字典创建DataFrame
index =['r1','r2','r3','r4']
df = pd.DataFrame(data,index=index)print(df)print('name列')print(df['name'])print('age列')print(df['age'])print('选取多列')print(df[['name','age']])# 不能使用切片选取多列
d ={'one':pd.Series([1,2,3],index=list('abc')),'two':pd.Series([1,2,3,4],index=list('abcd'))}
df = pd.DataFrame(d)#使用df['列']=值,插入新的数据列print('====通过Series添加一个新的列======')
df['three']=pd.Series([10,20,30],index=list('abc'))print(df)#将已经存在的数据列相加运算,从而创建一个新的列print('将已经存在的数据列相加运算,从而创建一个新的列')
df['four']=df['one']+df['three']print(df)
====通过Series添加一个新的列======
one two three
a 1.0 1 10.0
b 2.0 2 20.0
c 3.0 3 30.0
d NaN 4 NaN
将已经存在的数据列相加运算,从而创建一个新的列
one two three four
a 1.0 1 10.0 11.0
b 2.0 2 20.0 22.0
c 3.0 3 30.0 33.0
d NaN 4 NaN NaN
insert() 方法添加
df.insert(loca,column,value,allow_duplocates)
loca 整形,插入索引,必须验证0<=loca<=len(列)
column 插入列的标签,类型可以是(字符串/数字/散列对象)
value 数值 Series或者数组
allow_duplicates 允许重复,可以有相同的列标签数据,默认我False
info =[['王杰',18],['李杰',19],['刘杰',17]]
df = pd.DataFrame(info,columns=['name','age'])print(df)#注意是columns参数#数值1代表插入到columns列表的索引位置
df.insert(2,column='score',value=[91,90,75])print('=====df.insert插入数据======')print(df)
name age
0 王杰 18
1 李杰 19
2 刘杰 17
=====df.insert插入数据======
name age score
0 王杰 18 91
1 李杰 19 90
2 刘杰 17 75
d ={'one':pd.Series([1,2,3],index=list('abc')),'two':pd.Series([1,2,3,4],index=list('abcd')),'three':pd.Series([10,20,30],index=list('abc'))}
df = pd.DataFrame(d)print(df)#使用del删除del df['one']print('====del=====')print(df)#使用pop删除
res_pop = df.pop('two')print('====pop====')# print(df)print(res_pop)#返回值
one two three
a 1.0 1 10.0
b 2.0 2 20.0
c 3.0 3 30.0
d NaN 4 NaN
====del=====
two three
a 1 10.0
b 2 20.0
c 3 30.0
d 4 NaN
====pop====
a 1
b 2
c 3
d 4
Name: two, dtype: int64
行操作 DataFrame
1.loc访问标签选取
行操作需要借助loc属性来完成,按标签或布尔数组访问一组行和列
df.loc[行索引名称或条件,列索引名称]
补充 loc使用的是标签索引,iloc使用的是位置索引,列没有位置索引
d ={'one':pd.Series([1,2,3],index=list('abc')),'two':pd.Series([1,2,3,4],index=list('abcd'))}
df = pd.DataFrame(d)print('====df原始数据=====')print(df)#确定标签为b的数据print('======标签为b的数据====')print(df.loc['b'])
====df原始数据=====
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
======标签为b的数据====
one 2.0
two 2.0
Name: b, dtype: float64
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
2. iloc访问数值型索引和切片
使用数据型索引,需要使用iloc属性
直接使用索引,会优先查找的是列标签,如果找不到会报错,列没有位置索引
可以使用iloc :行基于整数位置的按位置选择索引
df.iloc[行索引位置,列索引位置]
data ={'name':['关羽','刘备','张飞','曹操'],'age':[28,34,29,42]}#通过字典创建DataFrame
index =['r1','r2','r3','r4']#通过字典创建DataFrame
df = pd.DataFrame(data,index=index)print(df)
C:\Users\DELL\AppData\Local\Temp\ipykernel_21160\1221644449.py:3: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df3 = df.append(d2,ignore_index=True)
C:\Users\DELL\AppData\Local\Temp\ipykernel_21160\3303151293.py:4: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df4 = df.append(df3)
name
age
salary
0
关羽
28
5000.0
1
刘备
34
8000.0
2
张飞
29
4500.0
3
曹操
42
10000.0
a
诸葛亮
30
NaN
2).追加列表
list是一维的,则以列的形式追加
list是二维的,则以行的形式追加
list是三维的,只添加一个值
使用 append可能会出现相同的index,可以使用ignore_index=True 来避免
data =[[1,2,3,4],[5,6,7,8]]
df = pd.DataFrame(data)print(df)
0 1 2 3
0 1 2 3 4.0
1 5 6 7 8.0
0 10 20 30 NaN
使用忽略行索引
0 1 2 3
0 1 2 3 4.0
1 5 6 7 8.0
2 10 20 30 NaN
C:\Users\DELL\AppData\Local\Temp\ipykernel_21160\4112976180.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df4 = df.append(a_1) #需要添加
C:\Users\DELL\AppData\Local\Temp\ipykernel_21160\4112976180.py:6: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df5 = df.append(a_1,ignore_index=True)
list是一维的
b =[1,2,3]
df5 = df.append(b,ignore_index=True)
df5
C:\Users\DELL\AppData\Local\Temp\ipykernel_21160\3694321706.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df5 = df.append(b,ignore_index=True)
a b
0 1 2
1 3 4
0 5 6
1 7 8
删除后数据
a b
1 3 4
1 7 8
C:\Users\DELL\AppData\Local\Temp\ipykernel_21160\2006458995.py:5: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df = df.append(df2)
data ={'name':'诸葛亮','age':30}
df1 = df.append(data,ignore_index=True)
df1
C:\Users\DELL\AppData\Local\Temp\ipykernel_21160\3890177474.py:3: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df1 = df.append(data,ignore_index=True)
Name
Age
salary
age
name
0
关羽
28.0
5000
NaN
NaN
1
刘备
34.0
8000
NaN
NaN
2
张飞
29.0
4500
NaN
NaN
3
曹操
42.0
10000
NaN
NaN
4
NaN
NaN
[5000, 8000, 4500, 10000]
[28, 34, 29, 42]
[关羽, 刘备, 张飞, 曹操]
5
NaN
NaN
[5000, 8000, 4500, 10000]
[28, 34, 29, 42]
[关羽, 刘备, 张飞, 曹操]
6
NaN
NaN
NaN
30
诸葛亮
7
NaN
NaN
NaN
30
诸葛亮
8
NaN
NaN
NaN
30
诸葛亮
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 4 non-null object
1 Age 4 non-null float64
2 salary 6 non-null object
3 age 4 non-null object
4 name 4 non-null object
dtypes: float64(1), object(4)
memory usage: 448.0+ bytes
order = pd.read_table('5/meal_order_info.csv',sep=',',encoding='gbk')print('进行转换前订单信息表lock_time类型',order['lock_time'].dtypes)
order['lock_time']= pd.to_datetime(order['lock_time'])print('转换后类型为',order['lock_time'].dtype)
year1 =[i.year for i in order['lock_time']]print('lock_time中的年份的数据前5个信息',year1[:5])
month1 =[i.month for i in order['lock_time']]print('lock_time中的月份的数据前5个信息',month1[:5])
day1 =[i.day for i in order['lock_time']]print('lock_time中的日期的数据前5个信息',day1[:5])
weekday1 =[i.weekday()for i in order['lock_time']]print('lock_time中的星期的数据前5个信息',weekday1[:5])
0 -153 days +11:11:46
1 -153 days +11:31:55
2 -153 days +12:54:37
3 -153 days +13:08:20
4 -153 days +13:07:16
...
940 -123 days +21:31:48
941 -123 days +21:56:12
942 -123 days +21:33:34
943 -123 days +21:55:39
944 -123 days +21:32:56
Name: lock_time, Length: 945, dtype: timedelta64[ns]
任务实现
order = pd.read_table('5/meal_order_info.csv',sep=',',encoding='gbk')#时间字符串转化为标注时间格式
order['use_start_time']= pd.to_datetime(order['use_start_time'])
order['lock_time']= pd.to_datetime(order['lock_time'])print('查看转换后的数据类型',order[['use_start_time','lock_time']].dtypes)#提取菜品数据里面的年月日和星期信息
year =[i.year for i in order['lock_time']]
month =[i.month for i in order['lock_time']]
day =[i.day for i in order['lock_time']]
week =[i.week for i in order['lock_time']]#提取周信息
weekday =[i.weekday()for i in order['lock_time']]#提取星期信息#查询订单信息表的时间统计信息
timemin = order['lock_time'].min()
timemax = order['lock_time'].max()
timem
F:\Anaconda\lib\site-packages\numpy\core\fromnumeric.py:3438: FutureWarning: In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'
return mean(axis=axis, dtype=dtype, out=out, **kwargs)
counts
amounts
order_id
137
1.500000
32.333333
165
1.166667
52.944444
166
1.400000
48.200000
171
1.428571
36.285714
177
1.000000
34.250000
...
...
...
1309
1.153846
34.076923
1314
1.000000
42.333333
1317
1.000000
67.222222
1319
1.000000
67.777778
1323
1.000000
50.933333
278 rows × 2 columns
使用transform方法聚合数据
detailGroup[['counts','amounts']].transform(lambda x : x*2).head(4)
counts
amounts
0
2
98
1
2
96
2
2
60
3
2
50
实现组内离差标准化
(均值-最小值)/(最大值-最小值)
detailGroup.apply(lambda x :( x.mean()-x.min())/(x.max()-x.min())).head()
order_id
counts
amounts
order_id
137
NaN
0.166667
0.319728
165
NaN
0.166667
0.260026
166
NaN
0.400000
0.409709
171
NaN
0.142857
0.477922
177
NaN
NaN
0.570312
任务实现
按时间对菜品订单信息进行拆分
使用agg 方法计算单日菜品销售的平均单价和售价中位数
使用apply方法统计单日菜品销售数目
detail = pd.read_excel('5/meal_order_detail.xlsx')# 将时间转换为时间格式
detail['date']=[i.date()for i in detail['place_order_time']]#按时间对菜品订单进行拆分
detailGroup = detail[['date','counts','amounts']].groupby(by='date')# for i ,j in detailGroup:# display(i,j)#求平均单价和售价中位数
detailGroup['amounts'].agg([np.mean,np.median]).head()#求菜品销售数目
detailGroup['counts'].apply(np.sum)