Pandas - Review


title: Pandas回顾及应用
category: 数据分析基础

Pandas 的应用

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['STFangsong']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'

Series

创建Series对象

# 1
ser1 = pd.Series([122,98,45,79],index = ['一季度','二季度','三季度','四季度'])  # index(*季度)可重复,但在通过该index索引数据时,得到的是一个新的Series
ser1
一季度    122
二季度     98
三季度     45
四季度     79
dtype: int64
# 2
ser2 = pd.Series({
    '一季度':999,
    '二季度':998,
    '三季度':997,
    '四季度':996,
})
ser2
# 通过字典创建数据系列时,当键(index)重复时,后者覆盖前者
一季度    999
二季度    998
三季度    997
四季度    996
dtype: int64
# 切片
# index切片时左闭右开
ser2[0:2]
一季度    999
二季度    998
dtype: int64
# 通过自己给定的索引切片时,两边均闭合
ser2['一季度':'三季度']
一季度    999
二季度    998
三季度    997
dtype: int64
# 布尔索引
ser1[ser1 > 100]
一季度    122
dtype: int64
# 花式索引
ser1[['一季度','四季度']],ser2[[1,3]]
(一季度    122
 四季度     79
 dtype: int64,
 二季度    998
 四季度    996
 dtype: int64)

Series 的属性 / 方法

# Series的值obj.values
ser2.values
array([999, 998, 997, 996], dtype=int64)
# Series 的index的值 obj.index.values
ser2.index.values
array(['一季度', '二季度', '三季度', '四季度'], dtype=object)
# Series 中的数据是否单调
ser2.is_monotonic
False
# 描述性统计信息
des = ser2.describe()
des
count      4.000000
mean     997.500000
std        1.290994
min      996.000000
25%      996.750000
50%      997.500000
75%      998.250000
max      999.000000
dtype: float64
des['min'],des['25%']
(996.0, 996.75)
ser3 = pd.Series([122,98,45,79,79,122],index = ['一季度','二季度','三季度','四季度','四季度','一季度'])
ser3
一季度    122
二季度     98
三季度     45
四季度     79
四季度     79
一季度    122
dtype: int64
# 独一无二的值
ser3.unique()
array([122,  98,  45,  79], dtype=int64)
# 不重复值的个数(返回整数)
ser3.nunique()
4
# 删除重复值
ser3_ = ser3.drop_duplicates()
ser3_
一季度    122
二季度     98
三季度     45
四季度     79
dtype: int64
# 重复值
ser3.duplicated()
一季度    False
二季度    False
三季度    False
四季度    False
四季度     True
一季度     True
dtype: bool
# 统计频次
ser3.value_counts()
122    2
79     2
98     1
45     1
dtype: int64
ser4 = pd.Series([122,98,np.nan,45,79,np.nan],index = ['一季度','二季度','三季度','四季度','五季度','六季度'])
ser4
一季度    122.0
二季度     98.0
三季度      NaN
四季度     45.0
五季度     79.0
六季度      NaN
dtype: float64
# 是否含有空值
ser4.isnull(),ser4.notnull()
(一季度    False
 二季度    False
 三季度     True
 四季度    False
 五季度    False
 六季度     True
 dtype: bool,
 一季度     True
 二季度     True
 三季度    False
 四季度     True
 五季度     True
 六季度    False
 dtype: bool)
# 删除空值
ser4.dropna()
一季度    122.0
二季度     98.0
四季度     45.0
五季度     79.0
dtype: float64
# 指定值填充空值
ser4.fillna(100)
一季度    122.0
二季度     98.0
三季度    100.0
四季度     45.0
五季度     79.0
六季度    100.0
dtype: float64
# 将空值填充为平均值
ser4.fillna(ser4.mean())
一季度    122.0
二季度     98.0
三季度     86.0
四季度     45.0
五季度     79.0
六季度     86.0
dtype: float64
# 用空值的前(ffill)/后(bfill)一个数据填充空值
ser4.fillna(method='ffill')
一季度    122.0
二季度     98.0
三季度     98.0
四季度     45.0
五季度     79.0
六季度     79.0
dtype: float64
ser5 = pd.Series(np.arange(1,10))
ser5
0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int32
# 返回满足条件的值以指定值填充不满足条件的值(默认填充为空值)
ser5.where(ser5<5),ser5.where(ser5<5,99)
(0    1.0
 1    2.0
 2    3.0
 3    4.0
 4    NaN
 5    NaN
 6    NaN
 7    NaN
 8    NaN
 dtype: float64,
 0     1
 1     2
 2     3
 3     4
 4    99
 5    99
 6    99
 7    99
 8    99
 dtype: int32)
# 返回不满足条件的值以指定值填充满足条件的值(默认填充为空值)
ser5.mask(ser5<5),ser5.mask(ser5<5,99)
(0    NaN
 1    NaN
 2    NaN
 3    NaN
 4    5.0
 5    6.0
 6    7.0
 7    8.0
 8    9.0
 dtype: float64,
 0    99
 1    99
 2    99
 3    99
 4     5
 5     6
 6     7
 7     8
 8     9
 dtype: int32)
ser6 = pd.Series(np.random.randint(30,80,50))
ser6
0     72
1     33
2     44
3     30
4     44
5     78
6     56
7     56
8     55
9     72
10    70
11    74
12    41
13    74
14    55
15    76
16    54
17    66
18    63
19    62
20    54
21    32
22    46
23    52
24    57
25    54
26    43
27    64
28    77
29    62
30    46
31    53
32    59
33    72
34    50
35    73
36    58
37    77
38    72
39    49
40    73
41    65
42    74
43    30
44    32
45    79
46    40
47    76
48    41
49    72
dtype: int32
ser6.map(lambda x: round((x ** 0.5 * 10),0))
0     85.0
1     57.0
2     66.0
3     55.0
4     66.0
5     88.0
6     75.0
7     75.0
8     74.0
9     85.0
10    84.0
11    86.0
12    64.0
13    86.0
14    74.0
15    87.0
16    73.0
17    81.0
18    79.0
19    79.0
20    73.0
21    57.0
22    68.0
23    72.0
24    75.0
25    73.0
26    66.0
27    80.0
28    88.0
29    79.0
30    68.0
31    73.0
32    77.0
33    85.0
34    71.0
35    85.0
36    76.0
37    88.0
38    85.0
39    70.0
40    85.0
41    81.0
42    86.0
43    55.0
44    57.0
45    89.0
46    63.0
47    87.0
48    64.0
49    85.0
dtype: float64
# 排序与头部值
ser7 = pd.Series([99,54,78,35,86],['apple', 'banana', 'pitaya', 'pitaya', 'durian'])
ser7
apple     99
banana    54
pitaya    78
pitaya    35
durian    86
dtype: int64
ser7.sort_values(inplace=True)
ser7
pitaya    35
banana    54
pitaya    78
durian    86
apple     99
dtype: int64
ser7.sort_index(inplace = True)
ser7
apple     99
banana    54
durian    86
pitaya    35
pitaya    78
dtype: int64
ser7.nlargest(3),ser7.nsmallest(3)
(apple     99
 durian    86
 pitaya    78
 dtype: int64,
 pitaya    35
 banana    54
 pitaya    78
 dtype: int64)
ser1.plot(figsize = (8,4), kind = 'bar', width = 0.3)   # 水平柱状图:barh
plt.grid(True,alpha = 0.3,axis = 'y', linestyle = '--')   # 设置网格线便于观察。alpha设置透明度、axis设置需要添加网格的轴
plt.xticks(rotation = 0)   # 横坐标标签水平显示
plt.yticks(np.arange(0,151,30))   # 订制纵坐标
for i in range(ser1.size):
    plt.text(i,ser1[i] + 1,ser1[i],ha = 'center')
plt.show()
    
ser1.plot(kind = 'pie',autopct = '%.2f%%',wedgeprops = dict(width = 0.4,edgecolor = 'white'),pctdistance = 0.8)
# wedgeprops = dict(width = 0.4,edgecolor = 'white'),pctdistance = 0.8
# width:环状图宽度;edgecolor:边界颜色;pctdistance:百分比显示距离(距圆心)
plt.ylabel('')
plt.title('2021各季度销售占比')
plt.show()

DataFrame

# 创建dataframe
scores = np.random.randint(60, 101, (5, 3))
courses = ['语文', '数学', '英语']
ids = [1001, 1002, 1003, 1004, 1005]
df1 = pd.DataFrame(data=scores, columns=courses, index=ids)
df1
语文数学英语
1001696282
1002758060
1003716770
1004909665
1005668993
scores = {
    '语文': [62, 72, 93, 88, 93],
    '数学': [95, 65, 86, 66, 87],
    '英语': [66, 75, 82, 69, 82],
}
ids = [1001, 1002, 1003, 1004, 1005]
df2 = pd.DataFrame(data=scores, index=ids)
df2
语文数学英语
1001629566
1002726575
1003938682
1004886669
1005938782
df3 = pd.read_csv('../files/data/2018年北京积分落户数据.csv',
                  index_col='id',
#                  quotechar='包围内容的字符(双引号自动去)',
                 )
df3.head()
namebirthdaycompanyscore
id
1杨效丰1972-12北京利德华福电气技术有限公司122.59
2纪丰伟1974-12北京航天数据股份有限公司121.25
3王永1974-05品牌联盟(北京)咨询股份公司118.96
4杨静1975-07中科专利商标代理有限责任公司118.21
5张凯江1974-11北京阿里巴巴云计算技术有限公司117.79
df4 = pd.read_csv('../files/data/bilibili.csv',encoding='GBK')
df4.head()
titleurlwatchnumdmuptimeupname
0阿里云大学课程(云计算、中间件、大数据、云安全)//www.bilibili.com/video/BV1Lv411s7wu?from=search295442021/1/21韭菜滚雪球
1视觉传达设计专业的小朋友大学课程有哪些,强度怎么样,需要什么技能?学姐给新生的解答与建议//www.bilibili.com/video/BV1Ea4y1a7CX?from=search352682020/7/25铧仔仔儿的奋斗史
2CAP:适合高中生的大学课程(上大学之前提前学习大学的课程)同济大学《微积分CAP》//www.bilibili.com/video/BV1X4411Y7u8?from=search5597172019/5/11愚甘杂货铺
3干货!论文读写系列|写作风格:例文解析(1)|人文社科|教育学|大学课程、知识//www.bilibili.com/video/BV1VC4y1b7ZA?from=search1.1万292020/7/26cici西西熙熙
4《用户体验与心理-第1期》大学课程//www.bilibili.com/video/BV1r7411M7gY?from=search137392020/2/24Luka老师
# 以制表符为分隔符的文件
df5 = pd.read_csv('../files/data/chipotle.tsv',delimiter='\t')
df5.head()
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
df6 = pd.read_excel('../files/data/2020年销售数据.xlsx',header=1,sheet_name = 'Sheet1')
# usecols = [] - 指定列; nrows = 100 - 只读100行; skiprows = np.arange(2,102) - 跳过前100行
df6.head()
销售日期销售信息销售区域销售渠道销售订单品牌售价销售数量销售额
02020-01-01上海拼多多订单182894-455上海拼多多182894-455八匹马99838217
12020-01-01上海抖音订单205635-402上海抖音205635-402八匹马219296351
22020-01-01上海天猫订单205654-021上海天猫205654-021八匹马1698514365
32020-01-01上海天猫订单205654-519上海天猫205654-519八匹马169142366
42020-01-01上海天猫订单377781-010上海天猫377781-010皮皮虾2496115189
df7 = pd.read_excel('../files/data/口罩销售数据.xlsx')
df7.head()
日期销售城市产品类别销售员数量单价金额
02020-01-01广州纯棉口罩王大锤616
12020-01-01武汉明星口罩秦红棉3030900
22020-01-01深圳明星口罩秦红棉23060
32020-01-03上海防雾霾口罩白元芳63281764
42020-01-04武汉明星口罩白元芳2530750
df8 = pd.read_excel('../files/data/某视频网站运营数据.xlsx')
df8.head()
video_idtitlechannel_titletagsviewslikesdislikescomment_count
02kyS6SvSYSEWE WANT TO TALK ABOUT OUR MARRIAGECaseyNeistatSHANtell martin74837457527296615954
11ZAPwfrtAFYThe Trump Presidency: Last Week Tonight with J...LastWeekTonightlast week tonight trump presidency|"last week ...241878397185614612703
25qpjK5DgCt4Racist Superman | Rudy Mancuso, King Bach & Le...Rudy Mancusoracist superman|"rudy"|"mancuso"|"king"|"bach"...319143414603353398181
3puqaWrEC7tYNickelback Lyrics: Real or Fake?Good Mythical Morningrhett and link|"gmm"|"good mythical morning"|"...343168101726662146
4d380meD0W0MI Dare You: GOING BALD!?nigahigaryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"...2095731132235198917518
# 连接MYSQL数据库创建DataFrame
import pymysql

conn = pymysql.connect(host = '47.104.31.138',port = 3306,
                      user = 'guest',password = 'Guest.618',
                      database = 'hrs',charset = 'utf8mb4')
conn
<pymysql.connections.Connection at 0x1b71d968070>
df9 = pd.read_sql('select dno,dname,dloc from tb_dept',conn,index_col = 'dno')
df9
dnamedloc
dno
10会计部北京
20研发部成都
30销售部重庆
40运维部天津
df10 = pd.read_sql('select eno,ename,job,sal,comm,dno from tb_emp',conn,index_col = 'eno')
df10
enamejobsalcommdno
eno
1359胡一刀销售员1800200.030
2056乔峰分析师50001500.020
3088李莫愁设计师3500800.020
3211张无忌程序员3200NaN20
3233丘处机程序员3400NaN20
3244欧阳锋程序员3200NaN20
3251张翠山程序员4000NaN20
3344黄蓉销售主管3000800.030
3577杨过会计2200NaN10
3588朱九真会计2500NaN10
4466苗人凤销售员2500NaN30
5234郭靖出纳2000NaN10
5566宋远桥会计师40001000.010
7800张三丰总裁90001200.020
df10['ename'].map(str.strip).apply(lambda x: x[0] + '*'*(len(x) - 1))
eno
1359    胡**
2056     乔*
3088    李**
3211    张**
3233    丘**
3244    欧**
3251    张**
3344     黄*
3577     杨*
3588    朱**
4466    苗**
5234     郭*
5566    宋**
7800    张**
Name: ename, dtype: object
df10.ename
eno
1359    胡一刀 
2056      乔峰
3088     李莫愁
3211     张无忌
3233     丘处机
3244     欧阳锋
3251     张翠山
3344      黄蓉
3577      杨过
3588     朱九真
4466     苗人凤
5234      郭靖
5566     宋远桥
7800     张三丰
Name: ename, dtype: object
df10[['ename','sal']]
enamesal
eno
1359胡一刀1800
2056乔峰5000
3088李莫愁3500
3211张无忌3200
3233丘处机3400
3244欧阳锋3200
3251张翠山4000
3344黄蓉3000
3577杨过2200
3588朱九真2500
4466苗人凤2500
5234郭靖2000
5566宋远桥4000
7800张三丰9000
# 行索引
df10.loc[2056],df10.iloc[1]
# df.loc[自己设置的索引],df.iloc[系统index]
(ename        乔峰
 job         分析师
 sal        5000
 comm     1500.0
 dno          20
 Name: 2056, dtype: object,
 ename        乔峰
 job         分析师
 sal        5000
 comm     1500.0
 dno          20
 Name: 2056, dtype: object)
# 花式索引
df10.loc[[2056,5566]]
enamejobsalcommdno
eno
2056乔峰分析师50001500.020
5566宋远桥会计师40001000.010
# 添加列
df10['married'] = ['未婚'] * 5 + ['已婚'] * 9
df10
enamejobsalcommdnomarried
eno
1359胡一刀销售员1800200.030未婚
2056乔峰分析师50001500.020未婚
3088李莫愁设计师3500800.020未婚
3211张无忌程序员3200NaN20未婚
3233丘处机程序员3400NaN20未婚
3244欧阳锋程序员3200NaN20已婚
3251张翠山程序员4000NaN20已婚
3344黄蓉销售主管3000800.030已婚
3577杨过会计2200NaN10已婚
3588朱九真会计2500NaN10已婚
4466苗人凤销售员2500NaN30已婚
5234郭靖出纳2000NaN10已婚
5566宋远桥会计师40001000.010已婚
7800张三丰总裁90001200.020已婚
# 添加行
df10.loc[9800] = ['Jack','架构师',12000,2000,20,'未婚']
df10
enamejobsalcommdnomarried
eno
1359胡一刀销售员1800200.030未婚
2056乔峰分析师50001500.020未婚
3088李莫愁设计师3500800.020未婚
3211张无忌程序员3200NaN20未婚
3233丘处机程序员3400NaN20未婚
3244欧阳锋程序员3200NaN20已婚
3251张翠山程序员4000NaN20已婚
3344黄蓉销售主管3000800.030已婚
3577杨过会计2200NaN10已婚
3588朱九真会计2500NaN10已婚
4466苗人凤销售员2500NaN30已婚
5234郭靖出纳2000NaN10已婚
5566宋远桥会计师40001000.010已婚
7800张三丰总裁90001200.020已婚
9800Jack架构师120002000.020未婚
# 获取单元格值
df10.at[3088,'job'],df10.iat[2,1]
# at[自己设置的行索引,'列名']   iat[行索引,列索引](均为整数位置)
('设计师', '设计师')
# 删除列
df10.drop(columns=['married','comm'],inplace = True)
df10
enamejobsaldno
eno
1359胡一刀销售员180030
2056乔峰分析师500020
3088李莫愁设计师350020
3211张无忌程序员320020
3233丘处机程序员340020
3244欧阳锋程序员320020
3251张翠山程序员400020
3344黄蓉销售主管300030
3577杨过会计220010
3588朱九真会计250010
4466苗人凤销售员250030
5234郭靖出纳200010
5566宋远桥会计师400010
7800张三丰总裁900020
9800Jack架构师1200020
# 删除行
df10.drop(index = [1359,3233,3088])
enamejobsaldno
eno
2056乔峰分析师500020
3211张无忌程序员320020
3244欧阳锋程序员320020
3251张翠山程序员400020
3344黄蓉销售主管300030
3577杨过会计220010
3588朱九真会计250010
4466苗人凤销售员250030
5234郭靖出纳200010
5566宋远桥会计师400010
7800张三丰总裁900020
9800Jack架构师1200020
# 重置索引
df10.reset_index(inplace = True)
df10
enoenamejobsaldno
01359胡一刀销售员180030
12056乔峰分析师500020
23088李莫愁设计师350020
33211张无忌程序员320020
43233丘处机程序员340020
53244欧阳锋程序员320020
63251张翠山程序员400020
73344黄蓉销售主管300030
83577杨过会计220010
93588朱九真会计250010
104466苗人凤销售员250030
115234郭靖出纳200010
125566宋远桥会计师400010
137800张三丰总裁900020
149800Jack架构师1200020
# 设置某列作为索引
# df10.set_index('eno')
# 设置多列为索引(多级索引)
df10.set_index(['ename','eno'])
jobsaldno
enameeno
胡一刀1359销售员180030
乔峰2056分析师500020
李莫愁3088设计师350020
张无忌3211程序员320020
丘处机3233程序员340020
欧阳锋3244程序员320020
张翠山3251程序员400020
黄蓉3344销售主管300030
杨过3577会计220010
朱九真3588会计250010
苗人凤4466销售员250030
郭靖5234出纳200010
宋远桥5566会计师400010
张三丰7800总裁900020
Jack9800架构师1200020
# 更改列名
df10.rename(columns={'eno':'编号','ename':'姓名','job':'职位','sal':'薪水','dno':'部门'})
编号姓名职位薪水部门
01359胡一刀销售员180030
12056乔峰分析师500020
23088李莫愁设计师350020
33211张无忌程序员320020
43233丘处机程序员340020
53244欧阳锋程序员320020
63251张翠山程序员400020
73344黄蓉销售主管300030
83577杨过会计220010
93588朱九真会计250010
104466苗人凤销售员250030
115234郭靖出纳200010
125566宋远桥会计师400010
137800张三丰总裁900020
149800Jack架构师1200020
# 调整列顺序
df10.reindex(columns=['ename','eno','sal','dno','job'])
enameenosaldnojob
0胡一刀1359180030销售员
1乔峰2056500020分析师
2李莫愁3088350020设计师
3张无忌3211320020程序员
4丘处机3233340020程序员
5欧阳锋3244320020程序员
6张翠山3251400020程序员
7黄蓉3344300030销售主管
8杨过3577220010会计
9朱九真3588250010会计
10苗人凤4466250030销售员
11郭靖5234200010出纳
12宋远桥5566400010会计师
13张三丰7800900020总裁
14Jack98001200020架构师
pd.merge(left = df10,right = df9,how = 'inner',on = 'dno')
enoenamejobsaldnodnamedloc
01359胡一刀销售员180030销售部重庆
13344黄蓉销售主管300030销售部重庆
24466苗人凤销售员250030销售部重庆
32056乔峰分析师500020研发部成都
43088李莫愁设计师350020研发部成都
53211张无忌程序员320020研发部成都
63233丘处机程序员340020研发部成都
73244欧阳锋程序员320020研发部成都
83251张翠山程序员400020研发部成都
97800张三丰总裁900020研发部成都
109800Jack架构师1200020研发部成都
113577杨过会计220010会计部北京
123588朱九真会计250010会计部北京
135234郭靖出纳200010会计部北京
145566宋远桥会计师400010会计部北京
df9.reset_index(inplace = True)
df9.rename(columns = {'dno':'dept_no'},inplace = True)
df9
dept_nodnamedloc
010会计部北京
120研发部成都
230销售部重庆
340运维部天津
# 合并两张表(表结构不同) - merge()
pd.merge(left = df10,right = df9,how = 'inner',left_on = 'dno',right_on = 'dept_no').drop(columns = 'dno')
enoenamejobsaldept_nodnamedloc
01359胡一刀销售员180030销售部重庆
13344黄蓉销售主管300030销售部重庆
24466苗人凤销售员250030销售部重庆
32056乔峰分析师500020研发部成都
43088李莫愁设计师350020研发部成都
53211张无忌程序员320020研发部成都
63233丘处机程序员340020研发部成都
73244欧阳锋程序员320020研发部成都
83251张翠山程序员400020研发部成都
97800张三丰总裁900020研发部成都
109800Jack架构师1200020研发部成都
113577杨过会计220010会计部北京
123588朱九真会计250010会计部北京
135234郭靖出纳200010会计部北京
145566宋远桥会计师400010会计部北京
df11 = pd.read_sql('select eno,ename,job,sal,comm,dno from tb_emp',conn,index_col = 'eno')
df12 = pd.read_sql('select eno,ename,job,sal,comm,dno from tb_emp2',conn,index_col = 'eno')
df12
enamejobsalcommdno
eno
9800骆昊架构师30000500020
9900王小刀程序员10000120020
9700王大锤程序员800060020
# 拼接表结构相同的表 - concat()
pd.concat((df11,df12))
enamejobsalcommdno
eno
1359胡一刀销售员1800200.030
2056乔峰分析师50001500.020
3088李莫愁设计师3500800.020
3211张无忌程序员3200NaN20
3233丘处机程序员3400NaN20
3244欧阳锋程序员3200NaN20
3251张翠山程序员4000NaN20
3344黄蓉销售主管3000800.030
3577杨过会计2200NaN10
3588朱九真会计2500NaN10
4466苗人凤销售员2500NaN30
5234郭靖出纳2000NaN10
5566宋远桥会计师40001000.010
7800张三丰总裁90001200.020
9800骆昊架构师300005000.020
9900王小刀程序员100001200.020
9700王大锤程序员8000600.020
# 读取多个表结构相同文件拼接
import os

filenames = os.listdir('../files/data/datas')
dfs = [pd.read_excel(os.path.join('../files/data/datas',filename),header = 1) for filename in filenames]
pd.concat(dfs,ignore_index=True)   # ignore_index=True - 忽略各表原有的行索引,重新生成连续索引
# pd.concat(dfs,ignore_index=True).to_excel('汇总数据.xlsx',index = False)
购药时间社保卡号商品编码商品名称销售数量应收金额实收金额
02018-01-01 星期五1616528236701强力VC银翘片682.869.00
12018-01-02 星期六1616528236701清热解毒口服液128.024.64
22018-01-06 星期三12602828236701感康216.815.00
32018-01-11 星期一10070343428236701三九感冒灵128.028.00
42018-01-15 星期五101554328236701三九感冒灵8224.0208.00
52018-01-20 星期三13389528236701三九感冒灵128.028.00
62018-01-31 星期日101464928236701三九感冒灵256.056.00
72018-02-17 星期三11177328236701三九感冒灵5149.0131.12
82018-02-22 星期一10065687828236701三九感冒灵129.826.22
92018-04-07 星期四11652628236701清热解毒口服液6168.0140.00
102018-04-13 星期三11005128236701清热解毒口服液256.056.00
112018-04-22 星期五10344628236701清热解毒口服液6168.0140.00
122018-05-01 星期日10070313828236701清热解毒口服液6168.0140.00
132018-05-05 星期四10031328528236701清热解毒口服液5140.0112.00
142018-05-05 星期四10070343428236701清热解毒口服液256.049.28
152018-05-05 星期四10073660228236701清热解毒口服液128.024.64
162018-03-05 星期六10077400828236701清热解毒口服液128.024.64
172018-03-07 星期一10077400828236701清热解毒口服液5140.0112.00
182018-03-09 星期三10079843728236701清热解毒口服液6168.0140.00
192018-03-15 星期二10031328528236701清热解毒口服液256.049.28
202018-03-15 星期二100703428236701清热解毒口服液256.049.28
212018-03-15 星期二10712328236701清热解毒口服液5140.0112.00
222018-03-20 星期日11668828236701清热解毒口服液6168.0140.00
232018-03-22 星期二10066351928236701清热解毒口服液128.028.00
df11[df11.dno == 20]
enamejobsalcommdno
eno
2056乔峰分析师50001500.020
3088李莫愁设计师3500800.020
3211张无忌程序员3200NaN20
3233丘处机程序员3400NaN20
3244欧阳锋程序员3200NaN20
3251张翠山程序员4000NaN20
7800张三丰总裁90001200.020
# 条件筛选
df11[(df11.dno == 20) & (df11.sal >= 4000)]
enamejobsalcommdno
eno
2056乔峰分析师50001500.020
3251张翠山程序员4000NaN20
7800张三丰总裁90001200.020
# 条件筛选
df11.query('dno == 20 and sal >= 4000')
enamejobsalcommdno
eno
2056乔峰分析师50001500.020
3251张翠山程序员4000NaN20
7800张三丰总裁90001200.020
np.sum(df11[(df11.dno == 20) & (df11.sal >= 4000)]['sal'])
18000
df11.dno.unique()
array([30, 20, 10], dtype=int64)
df9 = pd.read_sql('select dno,dname,dloc from tb_dept',conn,index_col = 'dno')
df9.loc[50] = ['会计部','北京']
df9.loc[60] = ['研发部','成都']
df9
dnamedloc
dno
10会计部北京
20研发部成都
30销售部重庆
40运维部天津
50会计部北京
60研发部成都
df9.duplicated('dname')
dno
10    False
20    False
30    False
40    False
50     True
60     True
dtype: bool
df9.drop_duplicates()
dnamedloc
dno
10会计部北京
20研发部成都
30销售部重庆
40运维部天津
df12 = pd.read_excel('../files/data/某视频网站运营数据.xlsx')
df12.head()
video_idtitlechannel_titletagsviewslikesdislikescomment_count
02kyS6SvSYSEWE WANT TO TALK ABOUT OUR MARRIAGECaseyNeistatSHANtell martin74837457527296615954
11ZAPwfrtAFYThe Trump Presidency: Last Week Tonight with J...LastWeekTonightlast week tonight trump presidency|"last week ...241878397185614612703
25qpjK5DgCt4Racist Superman | Rudy Mancuso, King Bach & Le...Rudy Mancusoracist superman|"rudy"|"mancuso"|"king"|"bach"...319143414603353398181
3puqaWrEC7tYNickelback Lyrics: Real or Fake?Good Mythical Morningrhett and link|"gmm"|"good mythical morning"|"...343168101726662146
4d380meD0W0MI Dare You: GOING BALD!?nigahigaryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"...2095731132235198917518
df12.nunique()
video_id          6351
title             6453
channel_title     2207
tags              6055
views            40478
likes            29850
dislikes          8516
comment_count    13773
dtype: int64
df12.drop_duplicates('video_id').shape
(6351, 8)
a = np.random.normal(110,5,50)
a = np.ceil(np.insert(a,-1,198))
a
array([112., 115., 103., 112., 108., 111., 113., 113., 110., 120., 112.,
       117., 106., 109., 110., 105., 115., 116., 108., 109., 120., 108.,
       117., 110., 110., 113., 107., 111., 114., 127., 117., 105.,  97.,
       105., 116., 108., 115., 124., 120., 100., 111., 117., 110., 112.,
       108., 113., 108., 116., 109., 198., 110.])
plt.boxplot(a)
plt.show()
# 查找异常值
def detect_outliers_iqr(data, whis=3):
    """Return the elements of *data* lying outside the IQR fences.

    A value x is flagged when x <= Q1 - whis*IQR or x >= Q3 + whis*IQR.

    :param data: 1-D numpy array of numeric values.
    :param whis: fence multiplier. Default is 3 to match the factor the
        original code hard-coded (the common boxplot convention is 1.5).
    :return: numpy array containing the flagged values.
    """
    q1 = np.quantile(data, 0.25)
    q3 = np.quantile(data, 0.75)
    iqr = q3 - q1
    # Bug fix: `whis` was previously declared but ignored (3 was hard-coded).
    lower = q1 - whis * iqr
    upper = q3 + whis * iqr
    return data[(data <= lower) | (data >= upper)]
detect_outliers_iqr(a)
array([198.])
def detect_outliers_zscore(data, threshold=3):
    """Return the elements of *data* whose absolute z-score exceeds *threshold*.

    :param data: 1-D numpy array of numeric values.
    :param threshold: z-score cut-off; values farther than this many
        standard deviations from the mean are flagged (default 3).
    :return: numpy array containing the flagged values.
    """
    mu = np.mean(data)
    sigma = np.std(data)
    deviation = np.abs(data - mu) / sigma
    return data[deviation > threshold]
b = detect_outliers_zscore(a)
a = pd.Series(a)
a
0     112.0
1     115.0
2     103.0
3     112.0
4     108.0
5     111.0
6     113.0
7     113.0
8     110.0
9     120.0
10    112.0
11    117.0
12    106.0
13    109.0
14    110.0
15    105.0
16    115.0
17    116.0
18    108.0
19    109.0
20    120.0
21    108.0
22    117.0
23    110.0
24    110.0
25    113.0
26    107.0
27    111.0
28    114.0
29    127.0
30    117.0
31    105.0
32     97.0
33    105.0
34    116.0
35    108.0
36    115.0
37    124.0
38    120.0
39    100.0
40    111.0
41    117.0
42    110.0
43    112.0
44    108.0
45    113.0
46    108.0
47    116.0
48    109.0
49    198.0
50    110.0
dtype: float64
a.replace(198,110)
0     112.0
1     115.0
2     103.0
3     112.0
4     108.0
5     111.0
6     113.0
7     113.0
8     110.0
9     120.0
10    112.0
11    117.0
12    106.0
13    109.0
14    110.0
15    105.0
16    115.0
17    116.0
18    108.0
19    109.0
20    120.0
21    108.0
22    117.0
23    110.0
24    110.0
25    113.0
26    107.0
27    111.0
28    114.0
29    127.0
30    117.0
31    105.0
32     97.0
33    105.0
34    116.0
35    108.0
36    115.0
37    124.0
38    120.0
39    100.0
40    111.0
41    117.0
42    110.0
43    112.0
44    108.0
45    113.0
46    108.0
47    116.0
48    109.0
49    110.0
50    110.0
dtype: float64
df13 = pd.read_csv('../files/data/bilibili.csv',encoding='GBK')
df13
titleurlwatchnumdmuptimeupname
0阿里云大学课程(云计算、中间件、大数据、云安全)//www.bilibili.com/video/BV1Lv411s7wu?from=search295442021/1/21韭菜滚雪球
1视觉传达设计专业的小朋友大学课程有哪些,强度怎么样,需要什么技能?学姐给新生的解答与建议//www.bilibili.com/video/BV1Ea4y1a7CX?from=search352682020/7/25铧仔仔儿的奋斗史
2CAP:适合高中生的大学课程(上大学之前提前学习大学的课程)同济大学《微积分CAP》//www.bilibili.com/video/BV1X4411Y7u8?from=search5597172019/5/11愚甘杂货铺
3干货!论文读写系列|写作风格:例文解析(1)|人文社科|教育学|大学课程、知识//www.bilibili.com/video/BV1VC4y1b7ZA?from=search1.1万292020/7/26cici西西熙熙
4《用户体验与心理-第1期》大学课程//www.bilibili.com/video/BV1r7411M7gY?from=search137392020/2/24Luka老师
.....................
1902【中国石油大学】构造地质学(第1-2章) ——主讲:李理老师//www.bilibili.com/video/BV1Y4411x7sE?from=search409752019/5/2點點滴滴Super
1903【模电】模拟电子技术基础【四】//www.bilibili.com/video/BV1sb411v7E6?from=search5997112019/3/8干脆的非洲鼓
1904【模电】模拟电子技术基础【三】//www.bilibili.com/video/BV1Rb411i7yo?from=search471382019/3/5干脆的非洲鼓
1905结构化学-东北大学-王军//www.bilibili.com/video/BV1yb411t7Mb?from=search583522019/2/28万万万万不能怂
1906《结构化学》南开大学-孙宏伟 重点难点解析//www.bilibili.com/video/BV1yt411t7UG?from=search528762019/1/28阿bu爱学习

1907 rows × 6 columns

from urllib.parse import urljoin

def fix_url(url):
    """Normalize a scheme-relative URL: drop its query string and add 'https:'.

    Bug fix: when the URL contains no '?', str.rfind returns -1 and the
    original slice url[:url.rfind('?')] silently chopped off the last
    character; such URLs are now kept intact.

    :param url: URL string, e.g. '//host/path?query'.
    :return: 'https:' + the URL with everything from the last '?' removed.
    """
    sep = url.rfind('?')
    path = url if sep == -1 else url[:sep]
    return urljoin('https:', path)
df13['url'] = df13['url'].apply(fix_url)
df13
titleurlwatchnumdmuptimeupname
0阿里云大学课程(云计算、中间件、大数据、云安全)https://www.bilibili.com/video/BV1Lv411s7wu295442021/1/21韭菜滚雪球
1视觉传达设计专业的小朋友大学课程有哪些,强度怎么样,需要什么技能?学姐给新生的解答与建议https://www.bilibili.com/video/BV1Ea4y1a7CX352682020/7/25铧仔仔儿的奋斗史
2CAP:适合高中生的大学课程(上大学之前提前学习大学的课程)同济大学《微积分CAP》https://www.bilibili.com/video/BV1X4411Y7u85597172019/5/11愚甘杂货铺
3干货!论文读写系列|写作风格:例文解析(1)|人文社科|教育学|大学课程、知识https://www.bilibili.com/video/BV1VC4y1b7ZA1.1万292020/7/26cici西西熙熙
4《用户体验与心理-第1期》大学课程https://www.bilibili.com/video/BV1r7411M7gY137392020/2/24Luka老师
.....................
1902【中国石油大学】构造地质学(第1-2章) ——主讲:李理老师https://www.bilibili.com/video/BV1Y4411x7sE409752019/5/2點點滴滴Super
1903【模电】模拟电子技术基础【四】https://www.bilibili.com/video/BV1sb411v7E65997112019/3/8干脆的非洲鼓
1904【模电】模拟电子技术基础【三】https://www.bilibili.com/video/BV1Rb411i7yo471382019/3/5干脆的非洲鼓
1905结构化学-东北大学-王军https://www.bilibili.com/video/BV1yb411t7Mb583522019/2/28万万万万不能怂
1906《结构化学》南开大学-孙宏伟 重点难点解析https://www.bilibili.com/video/BV1yt411t7UG528762019/1/28阿bu爱学习

1907 rows × 6 columns

def handle_watchnum(watchnum):
    """Convert a view-count string such as '1.1万' or '2954' to an int.

    The trailing character may be a Chinese magnitude suffix
    ('万' = 10**4, '亿' = 10**8); otherwise the string is taken as a
    plain number.

    :param watchnum: view-count string.
    :return: the count as an int.
    """
    multipliers = {'万': 10000, '亿': 100000000}
    suffix = watchnum[-1]
    if suffix in multipliers:
        return int(float(watchnum[:-1]) * multipliers[suffix])
    return int(float(watchnum))
df13['watchnum'] = df13['watchnum'].apply(handle_watchnum)
df13
titleurlwatchnumdmuptimeupname
0阿里云大学课程(云计算、中间件、大数据、云安全)https://www.bilibili.com/video/BV1Lv411s7wu295442021/1/21韭菜滚雪球
1视觉传达设计专业的小朋友大学课程有哪些,强度怎么样,需要什么技能?学姐给新生的解答与建议https://www.bilibili.com/video/BV1Ea4y1a7CX352682020/7/25铧仔仔儿的奋斗史
2CAP:适合高中生的大学课程(上大学之前提前学习大学的课程)同济大学《微积分CAP》https://www.bilibili.com/video/BV1X4411Y7u85597172019/5/11愚甘杂货铺
3干货!论文读写系列|写作风格:例文解析(1)|人文社科|教育学|大学课程、知识https://www.bilibili.com/video/BV1VC4y1b7ZA11000292020/7/26cici西西熙熙
4《用户体验与心理-第1期》大学课程https://www.bilibili.com/video/BV1r7411M7gY137392020/2/24Luka老师
.....................
1902【中国石油大学】构造地质学(第1-2章) ——主讲:李理老师https://www.bilibili.com/video/BV1Y4411x7sE409752019/5/2點點滴滴Super
1903【模电】模拟电子技术基础【四】https://www.bilibili.com/video/BV1sb411v7E65997112019/3/8干脆的非洲鼓
1904【模电】模拟电子技术基础【三】https://www.bilibili.com/video/BV1Rb411i7yo471382019/3/5干脆的非洲鼓
1905结构化学-东北大学-王军https://www.bilibili.com/video/BV1yb411t7Mb583522019/2/28万万万万不能怂
1906《结构化学》南开大学-孙宏伟 重点难点解析https://www.bilibili.com/video/BV1yt411t7UG528762019/1/28阿bu爱学习

1907 rows × 6 columns

df13['uptime'] = pd.to_datetime(df13.uptime)
df13.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1907 entries, 0 to 1906
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     1907 non-null   object        
 1   url       1907 non-null   object        
 2   watchnum  1907 non-null   int64         
 3   dm        1907 non-null   object        
 4   uptime    1907 non-null   datetime64[ns]
 5   upname    1907 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 89.5+ KB
df13.uptime.dt.year
0       2021
1       2020
2       2019
3       2020
4       2020
        ... 
1902    2019
1903    2019
1904    2019
1905    2019
1906    2019
Name: uptime, Length: 1907, dtype: int64

Pandas 对象的其他方法

复合数据类型拆分
df_example = pd.DataFrame({
    "col1":[[1,2,3], [4,5,6]],
    "col2":[[1] * 3, [9] * 2]
})
df_example
col1col2
0[1, 2, 3][1, 1, 1]
1[4, 5, 6][9, 9]
df_example.explode(column='col1')
col1col2
01[1, 1, 1]
02[1, 1, 1]
03[1, 1, 1]
14[9, 9]
15[9, 9]
16[9, 9]
df_example['col2'].explode()
0    1
0    1
0    1
1    9
1    9
Name: col2, dtype: object
窗口计算
ser_example = pd.Series(np.random.randint(10,100,10))
ser_example
0    48
1    80
2    77
3    66
4    59
5    72
6    71
7    97
8    86
9    33
dtype: int32
# rolling(N).聚合函数 - 前N个元素做运算(滚动步长为1)
ser_example.rolling(3).sum(),ser_example.rolling(5).mean()
(0      NaN
 1      NaN
 2    205.0
 3    223.0
 4    202.0
 5    197.0
 6    202.0
 7    240.0
 8    254.0
 9    216.0
 dtype: float64,
 0     NaN
 1     NaN
 2     NaN
 3     NaN
 4    66.0
 5    70.8
 6    69.0
 7    73.0
 8    77.0
 9    71.8
 dtype: float64)
计算协方差 - 用于衡量两个变量的总体误差

协方差

协方差(covariance):用于衡量两个随机变量的联合变化程度。如果变量 X X X的较大值主要与另一个变量 Y Y Y的较大值相对应,而两者较小值也相对应,那么两个变量倾向于表现出相似的行为,协方差为正。如果一个变量的较大值主要对应于另一个变量的较小值,则两个变量倾向于表现出相反的行为,协方差为负。简单的说,协方差的正负号显示着两个变量的相关性。方差是协方差的一种特殊情况,即变量与自身的协方差。

c o v ( X , Y ) = E ( ( X − μ ) ( Y − υ ) ) = E ( X ⋅ Y ) − μ υ cov(X,Y) = E((X - \mu)(Y - \upsilon)) = E(X \cdot Y) - \mu\upsilon cov(X,Y)=E((Xμ)(Yυ))=E(XY)μυ

如果 X X X Y Y Y是统计独立的,那么二者的协方差为0,这是因为在 X X X Y Y Y独立的情况下:

E ( X ⋅ Y ) = E ( X ) ⋅ E ( Y ) = μ υ E(X \cdot Y) = E(X) \cdot E(Y) = \mu\upsilon E(XY)=E(X)E(Y)=μυ

相关系数

协方差的数值大小取决于变量的大小,通常是不容易解释的,但是正态形式的协方差大小可以显示两变量线性关系的强弱。在统计学中,皮尔逊积矩相关系数用于度量两个变量 X X X Y Y Y之间的相关程度(线性相关),它的值介于-1到1之间。

ρ X , Y = c o v ( X , Y ) σ X σ Y \rho_{X,Y} = \frac {cov(X, Y)} {\sigma_{X}\sigma_{Y}} ρX,Y=σXσYcov(X,Y)

估算样本的协方差和标准差,可以得到样本皮尔逊系数,通常用英文小写字母 r r r表示。

r = ∑ i = 1 n ( X i − X ˉ ) ( Y i − Y ˉ ) ∑ i = 1 n ( X i − X ˉ ) 2 ∑ i = 1 n ( Y i − Y ˉ ) 2 r = \frac {\sum_{i=1}^{n}(X_i - \bar{X})(Y_i - \bar{Y})} {\sqrt{\sum_{i=1}^{n}(X_i - \bar{X})^2} \sqrt{\sum_{i=1}^{n}(Y_i - \bar{Y})^2}} r=i=1n(XiXˉ)2 i=1n(YiYˉ)2 i=1n(XiXˉ)(YiYˉ)

等价的表达式为:

r = 1 n − 1 ∑ i = 1 n ( X i − X ˉ σ X ) ( Y i − Y ˉ σ Y ) r = \frac {1} {n - 1} \sum_{i=1}^n \left( \frac {X_i - \bar{X}} {\sigma_X} \right) \left( \frac {Y_i - \bar{Y}} {\sigma_{Y}} \right) r=n11i=1n(σXXiXˉ)(σYYiYˉ)

  1. 皮尔逊相关系数
  • 两个变量之间是线性关系,都是连续数据。
  • 两个变量的总体是正态分布,或接近正态的单峰分布。
  • 两个变量的观测值是成对的,每对观测值之间相互独立。
  1. 斯皮尔曼相关系数

斯皮尔曼相关系数对数据条件的要求没有皮尔逊相关系数严格,只要两个变量的观测值是成对的等级评定资料,或者是由连续变量观测资料转化得到的等级资料,不论两个变量的总体分布形态、样本容量的大小如何,都可以用斯皮尔曼等级相关系数来进行研究。

# 安装/升级 scikit-learn 库
# !pip install -U scikit-learn
Looking in indexes: https://pypi.doubanio.com/simple
Requirement already satisfied: scikit-learn in d:\programs\python\python38\lib\site-packages (0.23.1)


WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the 'd:\programs\python\python38\python.exe -m pip install --upgrade pip' command.


Collecting scikit-learn
  Downloading https://pypi.doubanio.com/packages/b8/b2/84f9ed357e35359e34ffd25381468e5120be863659ba9dac9ae161b391b0/scikit_learn-1.0.1-cp38-cp38-win_amd64.whl (7.2 MB)
Requirement already satisfied: scipy>=1.1.0 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.4.1)
Requirement already satisfied: joblib>=0.11 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (2.2.0)
Requirement already satisfied: numpy>=1.14.6 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.18.2)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.1
    Uninstalling scikit-learn-0.23.1:
      Successfully uninstalled scikit-learn-0.23.1
Successfully installed scikit-learn-1.0.1
# 忽略警告信息
import warnings

warnings.filterwarnings('ignore')
from sklearn.datasets import load_boston

boston = load_boston()
boston
df_boston = pd.DataFrame(boston.data,columns= boston.feature_names)
df_boston['PRICE'] = boston.target
df_boston
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATPRICE
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
.............................................
5010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.6722.4
5020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.0820.6
5030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.6423.9
5040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.4822.0
5050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.8811.9

506 rows × 14 columns

# 协方差
df_boston.cov()['PRICE']
CRIM       -30.718508
ZN          77.315176
INDUS      -30.520823
CHAS         0.409409
NOX         -0.455412
RM           4.493446
AGE        -97.589017
DIS          4.840229
RAD        -30.561228
TAX       -726.255716
PTRATIO    -10.110657
B          279.989834
LSTAT      -48.447538
PRICE       84.586724
Name: PRICE, dtype: float64
plt.get_cmap('RdYlBu')
<matplotlib.colors.LinearSegmentedColormap at 0x1b71b058910>
df_boston.corr('spearman').style.background_gradient('spring')   # subset = ['PRICE'] - 指定某行上色
 CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATPRICE
CRIM1.000000-0.5716600.7355240.0415370.821465-0.3091160.704140-0.7449860.7278070.7290450.465283-0.3605550.634760-0.558891
ZN-0.5716601.000000-0.642811-0.041937-0.6348280.361074-0.5444230.614627-0.278767-0.371394-0.4484750.163135-0.4900740.438179
INDUS0.735524-0.6428111.0000000.0898410.791189-0.4153010.679487-0.7570800.4555070.6643610.433710-0.2858400.638747-0.578255
CHAS0.041537-0.0419370.0898411.0000000.0684260.0588130.067792-0.0802480.024579-0.044486-0.136065-0.039810-0.0505750.140612
NOX0.821465-0.6348280.7911890.0684261.000000-0.3103440.795153-0.8800150.5864290.6495270.391309-0.2966620.636828-0.562609
RM-0.3091160.361074-0.4153010.058813-0.3103441.000000-0.2780820.263168-0.107492-0.271898-0.3129230.053660-0.6408320.633576
AGE0.704140-0.5444230.6794870.0677920.795153-0.2780821.000000-0.8016100.4179830.5263660.355384-0.2280220.657071-0.547562
DIS-0.7449860.614627-0.757080-0.080248-0.8800150.263168-0.8016101.000000-0.495806-0.574336-0.3220410.249595-0.5642620.445857
RAD0.727807-0.2787670.4555070.0245790.586429-0.1074920.417983-0.4958061.0000000.7048760.318330-0.2825330.394322-0.346776
TAX0.729045-0.3713940.664361-0.0444860.649527-0.2718980.526366-0.5743360.7048761.0000000.453345-0.3298430.534423-0.562411
PTRATIO0.465283-0.4484750.433710-0.1360650.391309-0.3129230.355384-0.3220410.3183300.4533451.000000-0.0720270.467259-0.555905
B-0.3605550.163135-0.285840-0.039810-0.2966620.053660-0.2280220.249595-0.282533-0.329843-0.0720271.000000-0.2105620.185664
LSTAT0.634760-0.4900740.638747-0.0505750.636828-0.6408320.657071-0.5642620.3943220.5344230.467259-0.2105621.000000-0.852914
PRICE-0.5588910.438179-0.5782550.140612-0.5626090.633576-0.5475620.445857-0.346776-0.562411-0.5559050.185664-0.8529141.000000
# !pip install pandas-datareader
# Fetch Baidu (BIDU) daily quotes from Stooq for Nov 1 – Dec 8, 2021.
# Stooq returns rows newest-first, so sort by date ascending before use.
import pandas_datareader as pdr

baidu_df = pdr.get_data_stooq('BIDU', start='2021-11-1', end='2021-12-8').sort_index()
baidu_df
Open  High  Low  Close  Volume
Date
2021-11-01162.5400170.5300162.46170.323922601
2021-11-02166.1400166.1400161.33162.274162546
2021-11-03163.1500165.5000162.66165.372482714
2021-11-04167.0200167.7800162.28162.583126021
2021-11-05163.6777163.6777157.76158.234272586
2021-11-08159.6600161.5800158.41161.422271899
2021-11-09160.5200162.8800158.59161.632405390
2021-11-10161.7700164.7500160.62161.563052889
2021-11-11164.8800168.8000164.10167.263121199
2021-11-12166.9750171.3500166.33170.573143286
2021-11-15171.4050171.9900167.69168.671940542
2021-11-16171.9100173.1700169.05171.273232698
2021-11-17173.3800173.6000160.07161.826474141
2021-11-18155.7800158.2000152.41154.365584166
2021-11-19154.1100154.9000151.04151.773989447
2021-11-22152.0000152.4700146.89147.814423414
2021-11-23147.9000151.6200147.76150.493376812
2021-11-24148.9300151.5800146.89151.392663238
2021-11-26148.5200154.4500147.89153.063267102
2021-11-29153.0000153.0000148.80150.293745624
2021-11-30149.1200151.4500147.01149.844815568
2021-12-01150.6600152.0000147.51148.834139282
2021-12-02148.4000151.5500145.20148.964979653
2021-12-03141.8500143.6000132.14137.3910411847
2021-12-06136.2500147.8000136.11147.654962278
2021-12-07150.1800151.6100147.10149.894327345
# The row index is a DatetimeIndex of trading days
baidu_df.index
DatetimeIndex(['2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04',
               '2021-11-05', '2021-11-08', '2021-11-09', '2021-11-10',
               '2021-11-11', '2021-11-12', '2021-11-15', '2021-11-16',
               '2021-11-17', '2021-11-18', '2021-11-19', '2021-11-22',
               '2021-11-23', '2021-11-24', '2021-11-26', '2021-11-29',
               '2021-11-30', '2021-12-01', '2021-12-02', '2021-12-03',
               '2021-12-06', '2021-12-07'],
              dtype='datetime64[ns]', name='Date', freq=None)
# 10-day rolling (moving) average of the closing price; the first 9 rows are NaN
baidu_df.Close.rolling(10).mean()
Date
2021-11-01        NaN
2021-11-02        NaN
2021-11-03        NaN
2021-11-04        NaN
2021-11-05        NaN
2021-11-08        NaN
2021-11-09        NaN
2021-11-10        NaN
2021-11-11        NaN
2021-11-12    164.121
2021-11-15    163.956
2021-11-16    164.856
2021-11-17    164.501
2021-11-18    163.679
2021-11-19    163.033
2021-11-22    161.672
2021-11-23    160.558
2021-11-24    159.541
2021-11-26    158.121
2021-11-29    156.093
2021-11-30    154.210
2021-12-01    151.966
2021-12-02    150.680
2021-12-03    148.983
2021-12-06    148.571
2021-12-07    148.779
Name: Close, dtype: float64
# Plot closing prices against evenly spaced positions (one per trading day)
# and label each tick as "month-day" without zero padding (e.g. '11-1').
positions = np.arange(len(baidu_df))
tick_labels = [f'{d.month}-{d.day}' for d in baidu_df.index]
plt.plot(positions, baidu_df.Close)
plt.xticks(positions, labels=tick_labels, rotation=45)
plt.yticks(np.arange(100, 181, 10))
plt.show()
    

Index 对象

# Range index: 12 random monthly sales figures (400–999) indexed 1..12
month_index = pd.RangeIndex(1, 13, name='月份')
sales_data = np.random.randint(400, 1000, 12)
ser = pd.Series(sales_data, index=month_index)
ser
月份
1     719
2     495
3     528
4     513
5     947
6     934
7     759
8     705
9     704
10    491
11    473
12    528
dtype: int32
# The index object itself: a named RangeIndex
ser.index
RangeIndex(start=1, stop=13, step=1, name='月份')
# Hierarchical (multi-level) index: the Cartesian product of five student
# ids and two terms gives 10 rows; columns are three courses of random scores.
sms = ['期中', '期末']
ids = np.arange(1001, 1006)
index = pd.MultiIndex.from_product((ids, sms), names=['学号', '学期'])
courses = ['语文', '数学', '英语']
scores = np.random.randint(60, 101, (10, 3))
df = pd.DataFrame(
    scores,
    columns=courses,
    index=index,
)
df
语文  数学  英语
学号  学期
1001期中718795
期末6210095
1002期中669697
期末788367
1003期中888771
期末969462
1004期中858884
期末697287
1005期中686694
期末759263
# Demote the second index level ('学期') back into an ordinary column
df.reset_index(level=1)
学期  语文  数学  英语
学号
1001期中718795
1001期末6210095
1002期中669697
1002期末788367
1003期中888771
1003期末969462
1004期中858884
1004期末697287
1005期中686694
1005期末759263
pd.date_range('2021-1-1', '2021-6-1', periods=10)   # a fixed number (periods) of evenly spaced timestamps spanning the interval
DatetimeIndex(['2021-01-01 00:00:00', '2021-01-17 18:40:00',
               '2021-02-03 13:20:00', '2021-02-20 08:00:00',
               '2021-03-09 02:40:00', '2021-03-25 21:20:00',
               '2021-04-11 16:00:00', '2021-04-28 10:40:00',
               '2021-05-15 05:20:00', '2021-06-01 00:00:00'],
              dtype='datetime64[ns]', freq=None)
temp = pd.date_range('2021-1-1', '2021-6-1', freq='W')   # dates in the interval sampled at the given frequency (weekly, Sunday-anchored)
temp
DatetimeIndex(['2021-01-03', '2021-01-10', '2021-01-17', '2021-01-24',
               '2021-01-31', '2021-02-07', '2021-02-14', '2021-02-21',
               '2021-02-28', '2021-03-07', '2021-03-14', '2021-03-21',
               '2021-03-28', '2021-04-04', '2021-04-11', '2021-04-18',
               '2021-04-25', '2021-05-02', '2021-05-09', '2021-05-16',
               '2021-05-23', '2021-05-30'],
              dtype='datetime64[ns]', freq='W-SUN')
temp - pd.DateOffset(days=2)   # subtract the given number of days from every date; the parameter can also be months or years
DatetimeIndex(['2021-01-01', '2021-01-08', '2021-01-15', '2021-01-22',
               '2021-01-29', '2021-02-05', '2021-02-12', '2021-02-19',
               '2021-02-26', '2021-03-05', '2021-03-12', '2021-03-19',
               '2021-03-26', '2021-04-02', '2021-04-09', '2021-04-16',
               '2021-04-23', '2021-04-30', '2021-05-07', '2021-05-14',
               '2021-05-21', '2021-05-28'],
              dtype='datetime64[ns]', freq=None)
temp + pd.DateOffset(days=2)   # add the given number of days to every date; the parameter can also be months or years
DatetimeIndex(['2021-01-05', '2021-01-12', '2021-01-19', '2021-01-26',
               '2021-02-02', '2021-02-09', '2021-02-16', '2021-02-23',
               '2021-03-02', '2021-03-09', '2021-03-16', '2021-03-23',
               '2021-03-30', '2021-04-06', '2021-04-13', '2021-04-20',
               '2021-04-27', '2021-05-04', '2021-05-11', '2021-05-18',
               '2021-05-25', '2021-06-01'],
              dtype='datetime64[ns]', freq=None)
baidu_df.shift(3, fill_value=0)   # shift the data forward by 3 periods along the index, filling the vacated rows with 0
Open  High  Low  Close  Volume
Date
2021-11-010.00000.00000.000.000
2021-11-020.00000.00000.000.000
2021-11-030.00000.00000.000.000
2021-11-04162.5400170.5300162.46170.323922601
2021-11-05166.1400166.1400161.33162.274162546
2021-11-08163.1500165.5000162.66165.372482714
2021-11-09167.0200167.7800162.28162.583126021
2021-11-10163.6777163.6777157.76158.234272586
2021-11-11159.6600161.5800158.41161.422271899
2021-11-12160.5200162.8800158.59161.632405390
2021-11-15161.7700164.7500160.62161.563052889
2021-11-16164.8800168.8000164.10167.263121199
2021-11-17166.9750171.3500166.33170.573143286
2021-11-18171.4050171.9900167.69168.671940542
2021-11-19171.9100173.1700169.05171.273232698
2021-11-22173.3800173.6000160.07161.826474141
2021-11-23155.7800158.2000152.41154.365584166
2021-11-24154.1100154.9000151.04151.773989447
2021-11-26152.0000152.4700146.89147.814423414
2021-11-29147.9000151.6200147.76150.493376812
2021-11-30148.9300151.5800146.89151.392663238
2021-12-01148.5200154.4500147.89153.063267102
2021-12-02153.0000153.0000148.80150.293745624
2021-12-03149.1200151.4500147.01149.844815568
2021-12-06150.6600152.0000147.51148.834139282
2021-12-07148.4000151.5500145.20148.964979653
baidu_df.asfreq('10D', method='ffill')   # re-sample at the given frequency (every 10 days); gaps filled from the previous/next row via method='ffill'/'bfill'
Open  High  Low  Close  Volume
Date
2021-11-01162.54170.53162.46170.323922601
2021-11-11164.88168.80164.10167.263121199
2021-11-21154.11154.90151.04151.773989447
2021-12-01150.66152.00147.51148.834139282
baidu_df.resample('1M').mean()   # group rows by calendar month (M: month; D: day; Y: year) — the resampler is iterable and accepts aggregation functions. NOTE(review): newer pandas (>=2.2) spells month-end frequency 'ME'
Open  High  Low  Close  Volume
Date
2021-11-30160.1137162.353224157.144762159.6180953.593994e+06
2021-12-31145.4680149.312000141.612000146.5440005.764081e+06
# Total traded volume within each 10-day bucket
baidu_df.resample('10D').Volume.sum()
Date
2021-11-01    25696646
2021-11-11    27485479
2021-11-21    22291758
2021-12-01    28820405
Freq: 10D, Name: Volume, dtype: int64
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值