Pandas 总结试卷

逸巽散人

已于 2024-09-26 23:01:48 修改

阅读量1.1k

点赞数 14

文章标签： pandas

于 2024-09-26 22:56:56 首次发布

本文链接：https://blog.csdn.net/weixin_42039453/article/details/142579835

版权

Pandas 总结试卷

# 导入pandas库
import pandas as pd

# 0. 创建一个空的DataFrame
df = pd.DataFrame()
df

# 创建一个包含学生姓名、年龄、成绩的DataFrame
# 数据表为：
'''
姓名	年龄	成绩
张三	18	85
李四	19	90
王五	20	78
赵六	21	92
'''

# Assemble the student table one column at a time (name, age, score),
# then wrap the mapping in a DataFrame.
student_names = ['张三', '李四', '王五', '赵六']
student_ages = [18, 19, 20, 21]
student_scores = [85, 90, 78, 92]
data = dict(zip(['姓名', '年龄', '成绩'],
                [student_names, student_ages, student_scores]))
df = pd.DataFrame(data)
df

	姓名	年龄	成绩
0	张三	18	85
1	李四	19	90
2	王五	20	78
3	赵六	21	92

# 查看DataFrame的前3行和后2行
print(df.head(3))
print(df.tail(2))

   姓名  年龄  成绩
0  张三  18  85
1  李四  19  90
2  王五  20  78
   姓名  年龄  成绩
2  王五  20  78
3  赵六  21  92

# 导入data1.csv文件, 赋值给df1
df1 = pd.read_csv('data1.csv', sep='\t')
df1

	order_id	quantity	item_name	choice_description	item_price
0	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
1	1	1	Izze	[Clementine]	$3.39
2	1	1	Nantucket Nectar	[Apple]	$3.39
3	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39
4	2	2	Chicken Bowl	[Tomatillo-Red Chili Salsa (Hot), [Black Beans...	$16.98
...	...	...	...	...	...
4617	1833	1	Steak Burrito	[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...	$11.75
4618	1833	1	Steak Burrito	[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...	$11.75
4619	1834	1	Chicken Salad Bowl	[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...	$11.25
4620	1834	1	Chicken Salad Bowl	[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...	$8.75
4621	1834	1	Chicken Salad Bowl	[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...	$8.75

4622 rows × 5 columns

# 将df1保存为data1.xlsx文件
df1.to_excel('data1.xlsx', index=False)

# 创建一个包含数字1至10的一维Series
# One-dimensional Series holding the integers 1 through 10 (inclusive).
first_value, last_value = 1, 10
s = pd.Series(range(first_value, last_value + 1))
s

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

# Convert DataFrame `df` to a NumPy array.
# `to_numpy()` is the accessor the pandas documentation recommends over the
# legacy `.values` attribute; it yields the same array for this frame.
df.to_numpy()

array([['张三', 18, 85],
       ['李四', 19, 90],
       ['王五', 20, 78],
       ['赵六', 21, 92]], dtype=object)

# 将DataFrame `df` 转换为字典
df.to_dict()

{'姓名': {0: '张三', 1: '李四', 2: '王五', 3: '赵六'},
 '年龄': {0: 18, 1: 19, 2: 20, 3: 21},
 '成绩': {0: 85, 1: 90, 2: 78, 3: 92}}

# 导入data2.xlsx，其中数据表在Sheet2中，赋值给df2
df2 = pd.read_excel('data2.xlsx', sheet_name='Sheet2')
df2

# 发现df2的表头有问题，需要跳过3行，直接读取数据，并将60列表头设置为：
# ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 
# 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19', 'A20',
# 'A21', 'A22', 'A23', 'A24', 'A25', 'A26', 'A27', 'A28', 'A29', 'A30',
# 'A31', 'A32', 'A33', 'A34', 'A35', 'A36', 'A37', 'A38', 'A39', 'A40',
# 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A48', 'A49', 'A50',
# 'A51', 'A52', 'A53', 'A54', 'A55', 'A56', 'A57', 'A58', 'A59', 'A60']
# Column labels A1 .. A60, generated instead of typed out by hand.
column_labels = [f'A{number}' for number in range(1, 61)]

# Re-read Sheet2, skipping the first three rows, then relabel the columns.
# NOTE(review): with header=0 the first row after the skipped ones is consumed
# as the header and then overwritten by the labels — confirm that row is not a
# data row (if it is, header=None would keep it).
df2 = pd.read_excel('data2.xlsx', sheet_name='Sheet2', skiprows=3, header=0)
df2.columns = column_labels
df2.head(1)

# 将A16列和A22列按照日期的格式读入，或者转化为日期格式
df2['A16'] = pd.to_datetime(df2['A16'])
df2['A22'] = pd.to_datetime(df2['A22'], format='%Y%m%d')
print(df2['A16'])
print(df2['A22'])

0       2016-01-01
1       2016-01-01
2       2022-10-26
3       2017-01-01
4       2016-01-01
           ...    
23946   2016-01-01
23947   2020-07-01
23948   2018-01-01
23949   2016-01-01
23950   2016-01-01
Name: A16, Length: 23951, dtype: datetime64[ns]
0       2023-12-17
1       2023-12-17
2       2023-12-17
3       2023-12-17
4       2023-12-17
           ...    
23946   2023-12-17
23947   2023-12-17
23948   2023-12-17
23949   2023-12-17
23950   2023-12-17
Name: A22, Length: 23951, dtype: datetime64[ns]

from sklearn import datasets
# 读取iris数据集，赋值给iris
iris_ = datasets.load_iris()
iris = pd.DataFrame(data=iris_.data, columns=iris_.feature_names)
iris['target'] = iris_.target
iris

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
0	5.1	3.5	1.4	0.2	0
1	4.9	3.0	1.4	0.2	0
2	4.7	3.2	1.3	0.2	0
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	2
146	6.3	2.5	5.0	1.9	2
147	6.5	3.0	5.2	2.0	2
148	6.2	3.4	5.4	2.3	2
149	5.9	3.0	5.1	1.8	2

150 rows × 5 columns

# 查看iris的基本信息
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB

# 查看iris的描述性统计
iris.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
count	150.000000	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.057333	3.758000	1.199333	1.000000
std	0.828066	0.435866	1.765298	0.762238	0.819232
min	4.300000	2.000000	1.000000	0.100000	0.000000
25%	5.100000	2.800000	1.600000	0.300000	0.000000
50%	5.800000	3.000000	4.350000	1.300000	1.000000
75%	6.400000	3.300000	5.100000	1.800000	2.000000
max	7.900000	4.400000	6.900000	2.500000	2.000000

# 利用describe()的数据，展示iris的四分位数
iris.describe().loc[['25%', '50%', '75%'], :]

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
25%	5.1	2.8	1.60	0.3	0.0
50%	5.8	3.0	4.35	1.3	1.0
75%	6.4	3.3	5.10	1.8	2.0

# 也可以直接用quantile()计算并展示iris的四分位数
iris.quantile([0.25, 0.5, 0.75])

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
0.25	5.1	2.8	1.60	0.3	0.0
0.50	5.8	3.0	4.35	1.3	1.0
0.75	6.4	3.3	5.10	1.8	2.0

# 检查iris中是否有缺失值
iris.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

# 读取iris数据集，赋值给iris
iris_ = datasets.load_iris()
iris = pd.DataFrame(data=iris_.data, columns=iris_.feature_names)
iris['target'] = iris_.target
# 从前四列分别随机挑选2个数字将其替换为缺失值
import numpy as np
# 从前四列分别随机挑选2个数字将其替换为缺失值  
for column in iris.columns[:4]:  # 只处理前四列  
    # 随机选择2个索引  
    random_indices = np.random.choice(iris.index, 2, replace=False)  
    # 将选中的值替换为NaN  
    iris.loc[random_indices, column] = np.nan 
# 查看这些有缺失值的行
iris[iris.isnull().any(axis=1)]

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
7	5.0	3.4	1.5	NaN	0
8	NaN	2.9	1.4	0.2	0
39	5.1	3.4	NaN	0.2	0
71	6.1	2.8	4.0	NaN	1
75	6.6	NaN	4.4	1.4	1
86	6.7	3.1	NaN	1.5	1
115	6.4	NaN	5.3	2.3	2
137	NaN	3.1	5.5	1.8	2

# Every row that contains a missing value still carries a target label, so
# fill each gap with the mean of that column over the rows sharing the same
# target. transform('mean') returns a frame aligned row-by-row with `iris`,
# which is what lets fillna pick the matching group mean for each cell.
iris = iris.fillna(iris.groupby('target').transform('mean'))

# 检验
iris[iris.isnull().any(axis=1)]

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target

# 查看target的每个值出现的次数
iris['target'].value_counts()

target
0    50
1    50
2    50
Name: count, dtype: int64

# 删除第一行数据后，查看target的每个值出现的次数
iris.drop(0, axis=0, inplace=True)
iris['target'].value_counts()

target
1    50
2    50
0    49
Name: count, dtype: int64

# 将列名改为中文
# ['花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度', '类别']
iris.columns = ['花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度', '类别']
iris

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
1	4.9	3.0	1.4	0.2	0
2	4.7	3.2	1.3	0.2	0
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
5	5.4	3.9	1.7	0.4	0
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	2
146	6.3	2.5	5.0	1.9	2
147	6.5	3.0	5.2	2.0	2
148	6.2	3.4	5.4	2.3	2
149	5.9	3.0	5.1	1.8	2

149 rows × 5 columns

# 计算花萼长度和花萼宽度的相关系数
iris['花萼长度'].corr(iris['花萼宽度'])

-0.11924860426893708

# 过滤出花萼长度大于7的数据
iris[iris['花萼长度'] > 7]

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
102	7.1	3.0	5.9	2.1	2
105	7.6	3.0	6.6	2.1	2
107	7.3	2.9	6.3	1.8	2
109	7.2	3.6	6.1	2.5	2
117	7.7	3.8	6.7	2.2	2
118	7.7	2.6	6.9	2.3	2
122	7.7	2.8	6.7	2.0	2
125	7.2	3.2	6.0	1.8	2
129	7.2	3.0	5.8	1.6	2
130	7.4	2.8	6.1	1.9	2
131	7.9	3.8	6.4	2.0	2
135	7.7	3.0	6.1	2.3	2

# 切片出花萼长度大于7且花萼宽度小于3的数据
iris[(iris['花萼长度'] > 7) & (iris['花萼宽度'] < 3)]

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
107	7.3	2.9	6.3	1.8	2
118	7.7	2.6	6.9	2.3	2
122	7.7	2.8	6.7	2.0	2
130	7.4	2.8	6.1	1.9	2

# 切片第三行到第五行的数据
iris.iloc[2:5, :]

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
5	5.4	3.9	1.7	0.4	0

# 切片第三行到第五行的花萼长度和花萼宽度数据
iris.iloc[2:5, [0, 1]]

	花萼长度	花萼宽度
3	4.6	3.1
4	5.0	3.6
5	5.4	3.9

# 过滤类别为2的数据的前4行
iris[iris['类别'] == 2].head(4)

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
100	6.3	3.3	6.0	2.5	2
101	5.8	2.7	5.1	1.9	2
102	7.1	3.0	5.9	2.1	2
103	6.3	2.9	5.6	1.8	2

# 过滤类别为1的数据的第3-5行数据
iris[iris['类别'] == 1].iloc[2:5, :]

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
52	6.9	3.1	4.9	1.5	1
53	5.5	2.3	4.0	1.3	1
54	6.5	2.8	4.6	1.5	1

# 过滤出花萼长度大于7的数据，并按照花瓣长度降序排列
iris[iris['花萼长度'] > 7].sort_values(by='花瓣长度', ascending=False)

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
118	7.7	2.6	6.9	2.3	2
117	7.7	3.8	6.7	2.2	2
122	7.7	2.8	6.7	2.0	2
105	7.6	3.0	6.6	2.1	2
131	7.9	3.8	6.4	2.0	2
107	7.3	2.9	6.3	1.8	2
109	7.2	3.6	6.1	2.5	2
130	7.4	2.8	6.1	1.9	2
135	7.7	3.0	6.1	2.3	2
125	7.2	3.2	6.0	1.8	2
102	7.1	3.0	5.9	2.1	2
129	7.2	3.0	5.8	1.6	2

# 将iris按照花萼长度降序排列，重置索引
iris.sort_values(by='花萼长度', ascending=False).reset_index(drop=True)

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
0	7.9	3.8	6.4	2.0	2
1	7.7	3.0	6.1	2.3	2
2	7.7	2.8	6.7	2.0	2
3	7.7	3.8	6.7	2.2	2
4	7.7	2.6	6.9	2.3	2
...	...	...	...	...	...
144	4.6	3.6	1.0	0.2	0
145	4.5	2.3	1.3	0.3	0
146	4.4	3.0	1.3	0.2	0
147	4.4	3.2	1.3	0.2	0
148	4.3	3.0	1.1	0.1	0

149 rows × 5 columns

# 创建适合进行groupby操作的DataFrame
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': [1, 3, 2, 5, 4, 1, 2, 3],
                   'D': [2, 4, 5, 5, 1, 2, 4, 4],
                   'E': [1, 2, 3, 4, 5, 6, 7, 8],
                   'F': [2, 3, 4, 1, 2, 3, 4, 4]})
df

	A	B	C	D	E	F
0	foo	one	1	2	1	2
1	bar	one	3	4	2	3
2	foo	two	2	5	3	4
3	bar	three	5	5	4	1
4	foo	two	4	1	5	2
5	bar	two	1	2	6	3
6	foo	one	2	4	7	4
7	foo	three	3	4	8	4

# 按照A列和C列进行分组，计算D列的均值
df.groupby(['A', 'C'])['D'].mean()

A    C
bar  1    2.0
     3    4.0
     5    5.0
foo  1    2.0
     2    4.5
     3    4.0
     4    1.0
Name: D, dtype: float64

# 按照A列分组，使用聚合函数获取C列的均值、最大值、最小值、标准差和方差
df.groupby('A')['C'].agg(['mean', 'max', 'min', 'std', 'var'])

	mean	max	min	std	var
A
bar	3.0	5	1	2.000000	4.0
foo	2.4	4	1	1.140175	1.3

# 重命名聚合函数的列名{'mean': '均值', 'max': '最大值', 'min': '最小值', 'std': '标准差', 'var': '方差'}
df.groupby('A')['C'].agg(['mean', 'max', 'min', 'std', 'var']) \
                    .rename(columns={'mean': '均值', 
                                     'max': '最大值', 
                                     'min': '最小值', 
                                     'std': '标准差', 
                                     'var': '方差'})

	均值	最大值	最小值	标准差	方差
A
bar	3.0	5	1	2.000000	4.0
foo	2.4	4	1	1.140175	1.3

# 按照A列分组，对C列进行求和，并计算D列的均值
df.groupby('A').agg({'C': 'sum', 'D': 'mean'})

	C	D
A
bar	9	3.666667
foo	12	3.200000

# 按照A列分组，使用apply对C列应用函数，值乘以2
df.groupby(['A'])['C'].apply(lambda x: x * 2)

A     
bar  1     6
     3    10
     5     2
foo  0     2
     2     4
     4     8
     6     4
     7     6
Name: C, dtype: int64

# 将B列的值对应修改为整数
df['B'] = df['B'].map({'one': 1, 'two': 2, 'three': 3})
df

	A	B	C	D	E	F
0	foo	1	1	2	1	2
1	bar	1	3	4	2	3
2	foo	2	2	5	3	4
3	bar	3	5	5	4	1
4	foo	2	4	1	5	2
5	bar	2	1	2	6	3
6	foo	1	2	4	7	4
7	foo	3	3	4	8	4

# 在A列中每个foo后面加一个o，在bar前面加一个r
df['A'] = df['A'].str.replace('foo', 'fooo').str.replace('bar', 'rbar')
df

	A	B	C	D	E	F
0	fooo	1	1	2	1	2
1	rbar	1	3	4	2	3
2	fooo	2	2	5	3	4
3	rbar	3	5	5	4	1
4	fooo	2	4	1	5	2
5	rbar	2	1	2	6	3
6	fooo	1	2	4	7	4
7	fooo	3	3	4	8	4

# 创建两个df 用来练习 merge
# df1为用户编号、用户姓名、计费类型
df1 = pd.DataFrame({'id': ['001', '002', '003', '004'],
                    'name': ['Alice', 'Bob', 'Charlie', 'David'],
                    'type': ['A', 'B', 'A', 'B']})
# df2为用户编号、用户电话、电费
df2 = pd.DataFrame({'id': ['001', '002', '003', '004', '005'],
                    'phone': ['123456789', '987654321', '111111111', '222222222', '333333333'],
                    'bill': [100, 200, 300, 400, 500]})
print(df1)
print(df2)

    id     name type
0  001    Alice    A
1  002      Bob    B
2  003  Charlie    A
3  004    David    B
    id      phone  bill
0  001  123456789   100
1  002  987654321   200
2  003  111111111   300
3  004  222222222   400
4  005  333333333   500

# 将df1和df2按照id列进行内连接
df1.merge(df2, on='id', how='inner')

	id	name	type	phone	bill
0	001	Alice	A	123456789	100
1	002	Bob	B	987654321	200
2	003	Charlie	A	111111111	300
3	004	David	B	222222222	400

# 将df1和df2按照id列进行左连接
df1.merge(df2, on='id', how='left')

	id	name	type	phone	bill
0	001	Alice	A	123456789	100
1	002	Bob	B	987654321	200
2	003	Charlie	A	111111111	300
3	004	David	B	222222222	400

# 将df1和df2按照id列进行右连接
df1.merge(df2, on='id', how='right')

	id	name	type	phone	bill
0	001	Alice	A	123456789	100
1	002	Bob	B	987654321	200
2	003	Charlie	A	111111111	300
3	004	David	B	222222222	400
4	005	NaN	NaN	333333333	500

# 将df1和df2按照id列进行外连接
df1.merge(df2, on='id', how='outer')

	id	name	type	phone	bill
0	001	Alice	A	123456789	100
1	002	Bob	B	987654321	200
2	003	Charlie	A	111111111	300
3	004	David	B	222222222	400
4	005	NaN	NaN	333333333	500

# 使用concat函数将df1和df2纵向拼接
pd.concat([df1, df2], axis=0)

	id	name	type	phone	bill
0	001	Alice	A	NaN	NaN
1	002	Bob	B	NaN	NaN
2	003	Charlie	A	NaN	NaN
3	004	David	B	NaN	NaN
0	001	NaN	NaN	123456789	100.0
1	002	NaN	NaN	987654321	200.0
2	003	NaN	NaN	111111111	300.0
3	004	NaN	NaN	222222222	400.0
4	005	NaN	NaN	333333333	500.0

# 使用concat函数将df1和df2横向拼接
pd.concat([df1, df2], axis=1)

	id	name	type	id	phone	bill
0	001	Alice	A	001	123456789	100
1	002	Bob	B	002	987654321	200
2	003	Charlie	A	003	111111111	300
3	004	David	B	004	222222222	400
4	NaN	NaN	NaN	005	333333333	500

# 创建一个日期范围，从2021年1月1日开始，生成10个日期
pd.date_range(start='2021-01-01', periods=10)

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D')

# 读取data3.csv文件
df3 = pd.read_csv('data3.csv')
df3

	Date	Open	High	Low	Close	Volume	Adj Close
0	2014-07-08	96.27	96.80	93.92	95.35	65130000	95.35
1	2014-07-07	94.14	95.99	94.10	95.97	56305400	95.97
2	2014-07-03	93.67	94.10	93.20	94.03	22891800	94.03
3	2014-07-02	93.87	94.06	93.09	93.48	28420900	93.48
4	2014-07-01	93.52	94.07	93.13	93.52	38170200	93.52
...	...	...	...	...	...	...	...
8460	1980-12-18	26.63	26.75	26.63	26.63	18362400	0.41
8461	1980-12-17	25.87	26.00	25.87	25.87	21610400	0.40
8462	1980-12-16	25.37	25.37	25.25	25.25	26432000	0.39
8463	1980-12-15	27.38	27.38	27.25	27.25	43971200	0.42
8464	1980-12-12	28.75	28.87	28.75	28.75	117258400	0.45

8465 rows × 7 columns

# 查看data3.csv文件各列的数据类型
df3.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Volume         int64
Adj Close    float64
dtype: object

# 将data3.csv文件中的'Date'列转换为日期类型
df3['Date'] = pd.to_datetime(df3['Date'])
df3

	Date	Open	High	Low	Close	Volume	Adj Close
0	2014-07-08	96.27	96.80	93.92	95.35	65130000	95.35
1	2014-07-07	94.14	95.99	94.10	95.97	56305400	95.97
2	2014-07-03	93.67	94.10	93.20	94.03	22891800	94.03
3	2014-07-02	93.87	94.06	93.09	93.48	28420900	93.48
4	2014-07-01	93.52	94.07	93.13	93.52	38170200	93.52
...	...	...	...	...	...	...	...
8460	1980-12-18	26.63	26.75	26.63	26.63	18362400	0.41
8461	1980-12-17	25.87	26.00	25.87	25.87	21610400	0.40
8462	1980-12-16	25.37	25.37	25.25	25.25	26432000	0.39
8463	1980-12-15	27.38	27.38	27.25	27.25	43971200	0.42
8464	1980-12-12	28.75	28.87	28.75	28.75	117258400	0.45

8465 rows × 7 columns

# 将Date列设置为索引
df3.set_index('Date', inplace=True)
df3

	Open	High	Low	Close	Volume	Adj Close
Date
2014-07-08	96.27	96.80	93.92	95.35	65130000	95.35
2014-07-07	94.14	95.99	94.10	95.97	56305400	95.97
2014-07-03	93.67	94.10	93.20	94.03	22891800	94.03
2014-07-02	93.87	94.06	93.09	93.48	28420900	93.48
2014-07-01	93.52	94.07	93.13	93.52	38170200	93.52
...	...	...	...	...	...	...
1980-12-18	26.63	26.75	26.63	26.63	18362400	0.41
1980-12-17	25.87	26.00	25.87	25.87	21610400	0.40
1980-12-16	25.37	25.37	25.25	25.25	26432000	0.39
1980-12-15	27.38	27.38	27.25	27.25	43971200	0.42
1980-12-12	28.75	28.87	28.75	28.75	117258400	0.45

8465 rows × 6 columns

# Resample the time series into monthly bins and sum every column per month.
# 'ME' (month end) is the current frequency alias; the bare 'M' alias is
# deprecated for resampling and triggers a FutureWarning.
df3.resample('ME').sum()

C:\Users\ksufe\AppData\Local\Temp\ipykernel_10340\3032457932.py:2: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  df3.resample('M').sum()

	Open	High	Low	Close	Volume	Adj Close
Date
1980-12-31	396.26	397.38	395.76	395.76	336212800	6.15
1981-01-31	666.85	668.36	664.75	664.75	152247200	10.37
1981-02-28	503.12	504.87	501.75	501.75	80404800	7.81
1981-03-31	548.63	550.37	546.40	546.40	175179200	8.53
1981-04-30	573.02	574.73	571.77	571.77	134232000	8.89
...	...	...	...	...	...	...
2014-03-31	11205.46	11265.53	11131.49	11197.50	1250424700	1590.75
2014-04-30	11341.72	11431.33	11261.51	11362.56	1608765200	1614.21
2014-05-31	12627.34	12733.83	12564.99	12667.11	1433917100	1807.23
2014-06-30	4669.56	4705.77	4635.45	4675.82	1206556300	1929.60
2014-07-31	471.47	475.02	467.44	472.35	210918300	472.35

404 rows × 6 columns

# Resample the time series into yearly bins and average every column per year.
# 'YE' (year end) is the current frequency alias; the bare 'Y' alias is
# deprecated for resampling and triggers a FutureWarning.
df3.resample('YE').mean()

C:\Users\ksufe\AppData\Local\Temp\ipykernel_10340\2278679952.py:2: FutureWarning: 'Y' is deprecated and will be removed in a future version, please use 'YE' instead.
  df3.resample('Y').mean()

	Open	High	Low	Close	Volume	Adj Close
Date
1980-12-31	30.481538	30.567692	30.443077	30.443077	2.586252e+07	0.473077
1981-12-31	24.386349	24.471865	24.311151	24.311151	8.131889e+06	0.378651
1982-12-31	19.139723	19.412688	18.957036	19.142727	2.111167e+07	0.298261
1983-12-31	37.524841	38.376071	36.669841	37.521984	4.134987e+07	0.584643
1984-12-31	26.869960	27.393755	26.351581	26.801897	4.148126e+07	0.417787
1985-12-31	20.378814	20.595178	20.128656	20.194941	4.495383e+07	0.314862
1986-12-31	32.387391	32.938498	31.853676	32.460672	5.269093e+07	0.505494
1987-12-31	53.822688	55.036443	52.694585	53.889526	5.906256e+07	1.215652
1988-12-31	41.555889	42.186364	40.890356	41.538893	4.080334e+07	1.305771
1989-12-31	41.615000	42.300238	40.978611	41.658571	5.050181e+07	1.322341
1990-12-31	37.502016	38.219486	36.817233	37.561937	4.387544e+07	1.205257
1991-12-31	52.451542	53.425534	51.506877	52.494545	5.666764e+07	1.702292
1992-12-31	54.803661	55.602047	53.965000	54.802835	4.049007e+07	1.792795
1993-12-31	41.063241	41.778300	40.284783	41.026601	5.578353e+07	1.354664
1994-12-31	34.052222	34.711548	33.412897	34.080317	5.670228e+07	1.142738
1995-12-31	40.623056	41.267024	39.908413	40.540476	7.367712e+07	1.375556
1996-12-31	25.048110	25.421378	24.504803	24.919409	5.235652e+07	0.850709
1997-12-31	18.032372	18.360237	17.628972	17.966403	7.111004e+07	0.613123
1998-12-31	30.512381	31.265556	29.776627	30.564603	1.142800e+08	1.043413
1999-12-31	57.659484	59.099881	56.300992	57.770278	1.360146e+08	1.972063
2000-12-31	71.863889	74.191230	69.609563	71.749206	1.193468e+08	3.120873
2001-12-31	20.165323	20.766290	19.622379	20.219355	9.542117e+07	1.380484
2002-12-31	19.128056	19.522063	18.716270	19.139444	7.640271e+07	1.306825
2003-12-31	18.521786	18.843492	18.206984	18.544762	7.066493e+07	1.265992
2004-12-31	35.421468	36.029444	34.924643	35.526944	1.208350e+08	2.425635
2005-12-31	52.349683	53.111230	51.588214	52.401746	1.809534e+08	6.373651
2006-12-31	70.987610	71.939124	69.810359	70.810637	2.148396e+08	9.668964
2007-12-31	128.389084	130.070478	126.184502	128.273904	2.460119e+08	17.515219
2008-12-31	142.313755	145.110672	138.857708	141.979012	2.825901e+08	19.386324
2009-12-31	146.619087	148.495675	144.964881	146.814127	1.421168e+08	20.047063
2010-12-31	259.957619	262.368810	256.847619	259.842460	1.498263e+08	35.479802
2011-12-31	364.061429	367.423571	360.297698	364.004325	1.230747e+08	49.703135
2012-12-31	576.652720	581.825400	569.921160	576.049720	1.319642e+08	78.847600
2013-12-31	473.128135	477.638929	468.247103	472.634881	1.016087e+08	65.994563
2014-12-31	477.553256	481.363721	474.229922	478.036589	7.265242e+07	80.837674

# 重置索引
df3.reset_index(inplace=True)
df3

	Date	Open	High	Low	Close	Volume	Adj Close
0	2014-07-08	96.27	96.80	93.92	95.35	65130000	95.35
1	2014-07-07	94.14	95.99	94.10	95.97	56305400	95.97
2	2014-07-03	93.67	94.10	93.20	94.03	22891800	94.03
3	2014-07-02	93.87	94.06	93.09	93.48	28420900	93.48
4	2014-07-01	93.52	94.07	93.13	93.52	38170200	93.52
...	...	...	...	...	...	...	...
8460	1980-12-18	26.63	26.75	26.63	26.63	18362400	0.41
8461	1980-12-17	25.87	26.00	25.87	25.87	21610400	0.40
8462	1980-12-16	25.37	25.37	25.25	25.25	26432000	0.39
8463	1980-12-15	27.38	27.38	27.25	27.25	43971200	0.42
8464	1980-12-12	28.75	28.87	28.75	28.75	117258400	0.45

8465 rows × 7 columns

# 生成年、月、日、星期的列
df3['year'] = df3['Date'].dt.year
df3['month'] = df3['Date'].dt.month
df3['day'] = df3['Date'].dt.day
df3['weekday'] = df3['Date'].dt.weekday
df3

	Date	Open	High	Low	Close	Volume	Adj Close	year	month	day	weekday
0	2014-07-08	96.27	96.80	93.92	95.35	65130000	95.35	2014	7	8	1
1	2014-07-07	94.14	95.99	94.10	95.97	56305400	95.97	2014	7	7	0
2	2014-07-03	93.67	94.10	93.20	94.03	22891800	94.03	2014	7	3	3
3	2014-07-02	93.87	94.06	93.09	93.48	28420900	93.48	2014	7	2	2
4	2014-07-01	93.52	94.07	93.13	93.52	38170200	93.52	2014	7	1	1
...	...	...	...	...	...	...	...	...	...	...	...
8460	1980-12-18	26.63	26.75	26.63	26.63	18362400	0.41	1980	12	18	3
8461	1980-12-17	25.87	26.00	25.87	25.87	21610400	0.40	1980	12	17	2
8462	1980-12-16	25.37	25.37	25.25	25.25	26432000	0.39	1980	12	16	1
8463	1980-12-15	27.38	27.38	27.25	27.25	43971200	0.42	1980	12	15	0
8464	1980-12-12	28.75	28.87	28.75	28.75	117258400	0.45	1980	12	12	4

8465 rows × 11 columns

# 生成中华人民共和国成立当天的日期
new_china = pd.to_datetime('1949-10-01')
new_china

Timestamp('1949-10-01 00:00:00')

# 提取new_china的年份、月份、日期、星期
new_china.year, new_china.month, new_china.day, new_china.weekday()

(1949, 10, 1, 5)

# 检查当天是否为工作日
new_china.weekday() < 5

False

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
0	5.1	3.5	1.4	0.2	0
1	4.9	3.0	1.4	0.2	0
2	4.7	3.2	1.3	0.2	0
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	2
146	6.3	2.5	5.0	1.9	2
147	6.5	3.0	5.2	2.0	2
148	6.2	3.4	5.4	2.3	2
149	5.9	3.0	5.1	1.8	2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
7	5.0	3.4	1.5	NaN	0
8	NaN	2.9	1.4	0.2	0
39	5.1	3.4	NaN	0.2	0
71	6.1	2.8	4.0	NaN	1
75	6.6	NaN	4.4	1.4	1
86	6.7	3.1	NaN	1.5	1
115	6.4	NaN	5.3	2.3	2
137	NaN	3.1	5.5	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
1	4.9	3.0	1.4	0.2	0
2	4.7	3.2	1.3	0.2	0
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
5	5.4	3.9	1.7	0.4	0
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	2
146	6.3	2.5	5.0	1.9	2
147	6.5	3.0	5.2	2.0	2
148	6.2	3.4	5.4	2.3	2
149	5.9	3.0	5.1	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
102	7.1	3.0	5.9	2.1	2
105	7.6	3.0	6.6	2.1	2
107	7.3	2.9	6.3	1.8	2
109	7.2	3.6	6.1	2.5	2
117	7.7	3.8	6.7	2.2	2
118	7.7	2.6	6.9	2.3	2
122	7.7	2.8	6.7	2.0	2
125	7.2	3.2	6.0	1.8	2
129	7.2	3.0	5.8	1.6	2
130	7.4	2.8	6.1	1.9	2
131	7.9	3.8	6.4	2.0	2
135	7.7	3.0	6.1	2.3	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
107	7.3	2.9	6.3	1.8	2
118	7.7	2.6	6.9	2.3	2
122	7.7	2.8	6.7	2.0	2
130	7.4	2.8	6.1	1.9	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
100	6.3	3.3	6.0	2.5	2
101	5.8	2.7	5.1	1.9	2
102	7.1	3.0	5.9	2.1	2
103	6.3	2.9	5.6	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
118	7.7	2.6	6.9	2.3	2
117	7.7	3.8	6.7	2.2	2
122	7.7	2.8	6.7	2.0	2
105	7.6	3.0	6.6	2.1	2
131	7.9	3.8	6.4	2.0	2
107	7.3	2.9	6.3	1.8	2
109	7.2	3.6	6.1	2.5	2
130	7.4	2.8	6.1	1.9	2
135	7.7	3.0	6.1	2.3	2
125	7.2	3.2	6.0	1.8	2
102	7.1	3.0	5.9	2.1	2
129	7.2	3.0	5.8	1.6	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
0	7.9	3.8	6.4	2.0	2
1	7.7	3.0	6.1	2.3	2
2	7.7	2.8	6.7	2.0	2
3	7.7	3.8	6.7	2.2	2
4	7.7	2.6	6.9	2.3	2
...	...	...	...	...	...
144	4.6	3.6	1.0	0.2	0
145	4.5	2.3	1.3	0.3	0
146	4.4	3.0	1.3	0.2	0
147	4.4	3.2	1.3	0.2	0
148	4.3	3.0	1.1	0.1	0

	A	B	C	D	E	F
0	foo	1	1	2	1	2
1	bar	1	3	4	2	3
2	foo	2	2	5	3	4
3	bar	3	5	5	4	1
4	foo	2	4	1	5	2
5	bar	2	1	2	6	3
6	foo	1	2	4	7	4
7	foo	3	3	4	8	4

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
0	5.1	3.5	1.4	0.2	0
1	4.9	3.0	1.4	0.2	0
2	4.7	3.2	1.3	0.2	0
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	2
146	6.3	2.5	5.0	1.9	2
147	6.5	3.0	5.2	2.0	2
148	6.2	3.4	5.4	2.3	2
149	5.9	3.0	5.1	1.8	2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
7	5.0	3.4	1.5	NaN	0
8	NaN	2.9	1.4	0.2	0
39	5.1	3.4	NaN	0.2	0
71	6.1	2.8	4.0	NaN	1
75	6.6	NaN	4.4	1.4	1
86	6.7	3.1	NaN	1.5	1
115	6.4	NaN	5.3	2.3	2
137	NaN	3.1	5.5	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
1	4.9	3.0	1.4	0.2	0
2	4.7	3.2	1.3	0.2	0
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
5	5.4	3.9	1.7	0.4	0
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	2
146	6.3	2.5	5.0	1.9	2
147	6.5	3.0	5.2	2.0	2
148	6.2	3.4	5.4	2.3	2
149	5.9	3.0	5.1	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
102	7.1	3.0	5.9	2.1	2
105	7.6	3.0	6.6	2.1	2
107	7.3	2.9	6.3	1.8	2
109	7.2	3.6	6.1	2.5	2
117	7.7	3.8	6.7	2.2	2
118	7.7	2.6	6.9	2.3	2
122	7.7	2.8	6.7	2.0	2
125	7.2	3.2	6.0	1.8	2
129	7.2	3.0	5.8	1.6	2
130	7.4	2.8	6.1	1.9	2
131	7.9	3.8	6.4	2.0	2
135	7.7	3.0	6.1	2.3	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
107	7.3	2.9	6.3	1.8	2
118	7.7	2.6	6.9	2.3	2
122	7.7	2.8	6.7	2.0	2
130	7.4	2.8	6.1	1.9	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
100	6.3	3.3	6.0	2.5	2
101	5.8	2.7	5.1	1.9	2
102	7.1	3.0	5.9	2.1	2
103	6.3	2.9	5.6	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
118	7.7	2.6	6.9	2.3	2
117	7.7	3.8	6.7	2.2	2
122	7.7	2.8	6.7	2.0	2
105	7.6	3.0	6.6	2.1	2
131	7.9	3.8	6.4	2.0	2
107	7.3	2.9	6.3	1.8	2
109	7.2	3.6	6.1	2.5	2
130	7.4	2.8	6.1	1.9	2
135	7.7	3.0	6.1	2.3	2
125	7.2	3.2	6.0	1.8	2
102	7.1	3.0	5.9	2.1	2
129	7.2	3.0	5.8	1.6	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
0	7.9	3.8	6.4	2.0	2
1	7.7	3.0	6.1	2.3	2
2	7.7	2.8	6.7	2.0	2
3	7.7	3.8	6.7	2.2	2
4	7.7	2.6	6.9	2.3	2
...	...	...	...	...	...
144	4.6	3.6	1.0	0.2	0
145	4.5	2.3	1.3	0.3	0
146	4.4	3.0	1.3	0.2	0
147	4.4	3.2	1.3	0.2	0
148	4.3	3.0	1.1	0.1	0

	A	B	C	D	E	F
0	foo	1	1	2	1	2
1	bar	1	3	4	2	3
2	foo	2	2	5	3	4
3	bar	3	5	5	4	1
4	foo	2	4	1	5	2
5	bar	2	1	2	6	3
6	foo	1	2	4	7	4
7	foo	3	3	4	8	4

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
0	5.1	3.5	1.4	0.2	0
1	4.9	3.0	1.4	0.2	0
2	4.7	3.2	1.3	0.2	0
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	2
146	6.3	2.5	5.0	1.9	2
147	6.5	3.0	5.2	2.0	2
148	6.2	3.4	5.4	2.3	2
149	5.9	3.0	5.1	1.8	2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
7	5.0	3.4	1.5	NaN	0
8	NaN	2.9	1.4	0.2	0
39	5.1	3.4	NaN	0.2	0
71	6.1	2.8	4.0	NaN	1
75	6.6	NaN	4.4	1.4	1
86	6.7	3.1	NaN	1.5	1
115	6.4	NaN	5.3	2.3	2
137	NaN	3.1	5.5	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
1	4.9	3.0	1.4	0.2	0
2	4.7	3.2	1.3	0.2	0
3	4.6	3.1	1.5	0.2	0
4	5.0	3.6	1.4	0.2	0
5	5.4	3.9	1.7	0.4	0
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	2
146	6.3	2.5	5.0	1.9	2
147	6.5	3.0	5.2	2.0	2
148	6.2	3.4	5.4	2.3	2
149	5.9	3.0	5.1	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
102	7.1	3.0	5.9	2.1	2
105	7.6	3.0	6.6	2.1	2
107	7.3	2.9	6.3	1.8	2
109	7.2	3.6	6.1	2.5	2
117	7.7	3.8	6.7	2.2	2
118	7.7	2.6	6.9	2.3	2
122	7.7	2.8	6.7	2.0	2
125	7.2	3.2	6.0	1.8	2
129	7.2	3.0	5.8	1.6	2
130	7.4	2.8	6.1	1.9	2
131	7.9	3.8	6.4	2.0	2
135	7.7	3.0	6.1	2.3	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
107	7.3	2.9	6.3	1.8	2
118	7.7	2.6	6.9	2.3	2
122	7.7	2.8	6.7	2.0	2
130	7.4	2.8	6.1	1.9	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
100	6.3	3.3	6.0	2.5	2
101	5.8	2.7	5.1	1.9	2
102	7.1	3.0	5.9	2.1	2
103	6.3	2.9	5.6	1.8	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
118	7.7	2.6	6.9	2.3	2
117	7.7	3.8	6.7	2.2	2
122	7.7	2.8	6.7	2.0	2
105	7.6	3.0	6.6	2.1	2
131	7.9	3.8	6.4	2.0	2
107	7.3	2.9	6.3	1.8	2
109	7.2	3.6	6.1	2.5	2
130	7.4	2.8	6.1	1.9	2
135	7.7	3.0	6.1	2.3	2
125	7.2	3.2	6.0	1.8	2
102	7.1	3.0	5.9	2.1	2
129	7.2	3.0	5.8	1.6	2

	花萼长度	花萼宽度	花瓣长度	花瓣宽度	类别
0	7.9	3.8	6.4	2.0	2
1	7.7	3.0	6.1	2.3	2
2	7.7	2.8	6.7	2.0	2
3	7.7	3.8	6.7	2.2	2
4	7.7	2.6	6.9	2.3	2
...	...	...	...	...	...
144	4.6	3.6	1.0	0.2	0
145	4.5	2.3	1.3	0.3	0
146	4.4	3.0	1.3	0.2	0
147	4.4	3.2	1.3	0.2	0
148	4.3	3.0	1.1	0.1	0

	A	B	C	D	E	F
0	foo	1	1	2	1	2
1	bar	1	3	4	2	3
2	foo	2	2	5	3	4
3	bar	3	5	5	4	1
4	foo	2	4	1	5	2
5	bar	2	1	2	6	3
6	foo	1	2	4	7	4
7	foo	3	3	4	8	4