日期格式数据处理
- Pandas中使用to_datetime()方法将文本格式转换为日期格式
- dataframe数据类型如果为datetime64,可以使用dt方法取出年月日等
- 对于时间差数据,可以使用timedelta函数将其转换为指定时间单位的数值
- 时间差数据,可以使用dt方法访问其常用属性
import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据转换'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})
df.head(5)
| user_id | auction_id | cat_id | cat1 | property | buy_mount | day |
---|
0 | 786295544 | 41098319944 | 50014866 | 50022520 | 21458:86755362;13023209:3593274;10984217:21985... | 2 | 20140919 |
---|
1 | 532110457 | 17916191097 | 50011993 | 28 | 21458:11399317;1628862:3251296;21475:137325;16... | 1 | 20131011 |
---|
2 | 249013725 | 21896936223 | 50012461 | 50014815 | 21458:30992;1628665:92012;1628665:3233938;1628... | 1 | 20131011 |
---|
3 | 917056007 | 12515996043 | 50018831 | 50014815 | 21458:15841995;21956:3494076;27000458:59723383... | 2 | 20141023 |
---|
4 | 444069173 | 20487688075 | 50013636 | 50008168 | 21458:30992;13658074:3323064;1628665:3233941;1... | 1 | 20141103 |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 7 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.6+ MB
df['buy_date'] = pd.to_datetime(df['day'], format='%Y%m%d', errors='coerce')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 8 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
buy_date 29971 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(2)
memory usage: 1.8+ MB
df.head(5)
| user_id | auction_id | cat_id | cat1 | property | buy_mount | day | buy_date |
---|
0 | 786295544 | 41098319944 | 50014866 | 50022520 | 21458:86755362;13023209:3593274;10984217:21985... | 2 | 20140919 | 2014-09-19 |
---|
1 | 532110457 | 17916191097 | 50011993 | 28 | 21458:11399317;1628862:3251296;21475:137325;16... | 1 | 20131011 | 2013-10-11 |
---|
2 | 249013725 | 21896936223 | 50012461 | 50014815 | 21458:30992;1628665:92012;1628665:3233938;1628... | 1 | 20131011 | 2013-10-11 |
---|
3 | 917056007 | 12515996043 | 50018831 | 50014815 | 21458:15841995;21956:3494076;27000458:59723383... | 2 | 20141023 | 2014-10-23 |
---|
4 | 444069173 | 20487688075 | 50013636 | 50008168 | 21458:30992;13658074:3323064;1628665:3233941;1... | 1 | 20141103 | 2014-11-03 |
---|
df['diff_day'] = pd.datetime.now() - df['buy_date']
df.head(5)
| user_id | auction_id | cat_id | cat1 | property | buy_mount | day | buy_date | diff_day |
---|
0 | 786295544 | 41098319944 | 50014866 | 50022520 | 21458:86755362;13023209:3593274;10984217:21985... | 2 | 20140919 | 2014-09-19 | 2034 days 22:32:35.614788 |
---|
1 | 532110457 | 17916191097 | 50011993 | 28 | 21458:11399317;1628862:3251296;21475:137325;16... | 1 | 20131011 | 2013-10-11 | 2377 days 22:32:35.614788 |
---|
2 | 249013725 | 21896936223 | 50012461 | 50014815 | 21458:30992;1628665:92012;1628665:3233938;1628... | 1 | 20131011 | 2013-10-11 | 2377 days 22:32:35.614788 |
---|
3 | 917056007 | 12515996043 | 50018831 | 50014815 | 21458:15841995;21956:3494076;27000458:59723383... | 2 | 20141023 | 2014-10-23 | 2000 days 22:32:35.614788 |
---|
4 | 444069173 | 20487688075 | 50013636 | 50008168 | 21458:30992;13658074:3323064;1628665:3233941;1... | 1 | 20141103 | 2014-11-03 | 1989 days 22:32:35.614788 |
---|
df.dtypes
user_id object
auction_id int64
cat_id int64
cat1 int64
property object
buy_mount int64
day int64
buy_date datetime64[ns]
diff_day timedelta64[ns]
dtype: object
df['时间差'] = df['diff_day']/pd.Timedelta('1 D')
df['时间差'].head(5)
0 2034.939301
1 2377.939301
2 2377.939301
3 2000.939301
4 1989.939301
Name: 时间差, dtype: float64
df['时间差'] = df['diff_day']/pd.Timedelta('1 H')
df['时间差'].head(5)
0 48838.543226
1 57070.543226
2 57070.543226
3 48022.543226
4 47758.543226
Name: 时间差, dtype: float64
df['时间差'] = df['diff_day']/pd.Timedelta('1 M')
df['时间差'].head(5)
0 2.930313e+06
1 3.424233e+06
2 3.424233e+06
3 2.881353e+06
4 2.865513e+06
Name: 时间差, dtype: float64
df['时间差'].head(5).round(decimals=3)
0 2930312.594
1 3424232.594
2 3424232.594
3 2881352.594
4 2865512.594
Name: 时间差, dtype: float64
df['diff_day'].astype('timedelta64[D]').head(5)
0 2034.0
1 2377.0
2 2377.0
3 2000.0
4 1989.0
Name: diff_day, dtype: float64
df['diff_day'].astype('timedelta64[M]').head(5)
0 66.0
1 78.0
2 78.0
3 65.0
4 65.0
Name: diff_day, dtype: float64
df['diff_day'].astype('timedelta64[Y]').head(5)
0 5.0
1 6.0
2 6.0
3 5.0
4 5.0
Name: diff_day, dtype: float64
df['diff_day'].astype('timedelta64[h]').head(5)
0 48838.0
1 57070.0
2 57070.0
3 48022.0
4 47758.0
Name: diff_day, dtype: float64