时间类型数据预处理:
0、时间格式转换
dateframe 日期数据,字符型转换成日期格式:
pd.to_datetime(dateframe_timedate,format='%Y/%m/%d %H:%M:%S')
常见的报错检查点:
- 格式不匹配,检查原始数据中的日期格式,‘2019/01/01 00:00:00’只能转换成format=’%Y/%m/%d %H:%M:%S’。同理,‘2019-01-01 00:00:00’只能转换成format=’%Y-%m-%d %H:%M:%S’。
在excel或者csv中,可以通过设置单元格格式——自定义格式,统一时间字符串的格式。或者调整format=’%Y/%m/%d %H:%M:%S’。 - 日期超出python中日期范围,检查原始数据中日期,通常是日期填写错误。
1、异常值处理
时间内容填写错误置空
np.where(条件,x,y)#条件正确取值为x,条件错误取值为y。
np.where()可见np.where()用法
TFS_data_origin['Date_of_submission'] = np.where((len(TFS_data_origin['Date_of_submission'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Date_of_submission'])
时间之间逻辑错误值置空
多个日期比较前,先进行时间格式转换,字符型时间相互比较会有问题。
np.where(条件,x,y)#条件正确取值为x,条件错误取值为y。
TFS_data_origin['Delivered_date'] = np.where((TFS_data_origin['Delivered_date']<TFS_data_origin['Date_of_submission'])|\
(TFS_data_origin['Delivered_date']<TFS_data_origin['Adopted_date'])|\
(TFS_data_origin['Delivered_date']<TFS_data_origin['Delivery_date']) ,\
None, TFS_data_origin['Delivered_date'])
2、两个日期间隔天数计算
多个日期计算前,先进行时间格式转换。
dataframe格式日期天数计算参见另一篇博文dataframe计算两个日期之间天数 两种思路(推荐思路2),这里采用的是思路2.
FS_data_origin['Req_delivery_cycle'] = (TFS_data_origin['Delivered_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1
3、计算周期统计量
group by 结合describe()或者聚合函数sum()等。
dateframe中多种数据类型,可以选择其中的某几列进行统计。
dateframe某一列中多种类型,可以强制转换成数值型再进行统计。
TFS_data[['Req_delivery_cycle','Req_delivery_waiting_cycle','Req_RD_cycle',\
'Req_plan_cycle']]=TFS_data[['Req_delivery_cycle','Req_delivery_waiting_cycle',\
'Req_RD_cycle','Req_plan_cycle']].astype('float')
TFS_statistic = TFS_data.groupby("project_name").describe(percentiles=[.5, .8])
栗子
'''
2010-01-08
主题:python dataframe时间类型数据预处理以及周期统计分析
Dandan
'''
import pandas as pd
import numpy as np
import time, datetime
from datetime import datetime
#导入数据
#encoding = ‘gbk’ windows下中文的编码
TFS_data_origin = pd.read_csv("data_input/timedata.csv",encoding = 'gbk')
#原始数据预览
print('*'*10+'TFS_data_origin shape'+'*'*10)
TFS_data_origin.describe()
print(TFS_data_origin.shape)
#数据预处理
#TFS数据预处理
#日期填写率统计
TFS_data_origin.isnull().any()
print(TFS_data_origin.isnull().sum())
#1-日期异常值处理
#日期填写内容错误,比如“0000-01-01 00:00:00”,日期长度格式填写错误,比如'2019-01-01 00:00'
TFS_data_origin['Date_of_submission'] = np.where((len(TFS_data_origin['Date_of_submission'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Date_of_submission'])
TFS_data_origin['Adopted_date'] = np.where((TFS_data_origin['Adopted_date']<'2019/01/01 00:00:00')|\
(len(TFS_data_origin['Adopted_date'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Adopted_date'])
TFS_data_origin['Delivery_date'] = np.where((TFS_data_origin['Delivery_date']<'2019/01/01 00:00:00')|\
(len(TFS_data_origin['Delivery_date'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Delivery_date'])
TFS_data_origin['Delivered_date'] = np.where((TFS_data_origin['Delivered_date']<'2019/01/01 00:00:00')|\
(len(TFS_data_origin['Delivered_date'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Delivered_date'])
#日期字符串日期化,字符串转日期格式,以便进行日期比较。字符型日期比较会出错。'2020/1/9'>'2020/1/13'
TFS_data_origin['Date_of_submission'] = pd.to_datetime(TFS_data_origin['Date_of_submission'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Adopted_date'] = pd.to_datetime(TFS_data_origin['Adopted_date'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Delivery_date'] = pd.to_datetime(TFS_data_origin['Delivery_date'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Delivered_date'] = pd.to_datetime(TFS_data_origin['Delivered_date'],format='%Y/%m/%d %H:%M:%S')
#日期之间关系异常,理论上'Date_of_submission'<='Adopted_date'<='Delivery_date'<='Delivered_date'
TFS_data_origin['Delivered_date'] = np.where((TFS_data_origin['Delivered_date']<TFS_data_origin['Date_of_submission'])|\
(TFS_data_origin['Delivered_date']<TFS_data_origin['Adopted_date'])|\
(TFS_data_origin['Delivered_date']<TFS_data_origin['Delivery_date']) ,\
None, TFS_data_origin['Delivered_date'])
TFS_data_origin['Delivery_date'] = np.where((TFS_data_origin['Delivery_date']<TFS_data_origin['Date_of_submission'])|\
(TFS_data_origin['Delivery_date']<TFS_data_origin['Adopted_date']),\
None, TFS_data_origin['Delivery_date'])
TFS_data_origin['Adopted_date'] = np.where((TFS_data_origin['Adopted_date']<TFS_data_origin['Date_of_submission']),\
None, TFS_data_origin['Adopted_date'])
# TFS_data_origin.describe()
print('*'*20)
TFS_data_origin.isnull().any()
print(TFS_data_origin.isnull().sum())
#日期字符串日期化,字符串转日期格式,以便进行日期比较。字符型日期会存在问题。
TFS_data_origin['Date_of_submission'] = pd.to_datetime(TFS_data_origin['Date_of_submission'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Adopted_date'] = pd.to_datetime(TFS_data_origin['Adopted_date'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Delivery_date'] = pd.to_datetime(TFS_data_origin['Delivery_date'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Delivered_date'] = pd.to_datetime(TFS_data_origin['Delivered_date'],format='%Y/%m/%d %H:%M:%S')
'''
计算周期,没有考虑状态!
TFS_data_origin['Req_delivery_cycle'] = (TFS_data_origin['Delivered_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1
print(TFS_data_origin['Req_delivery_cycle'].isnull().sum())
print(TFS_data_origin['Req_delivery_cycle'].mean())
TFS_data_origin['Req_RD_cycle'] = (TFS_data_origin['Delivery_date'] - TFS_data_origin['Adopted_date']).dt.days + 1
print(TFS_data_origin['Req_RD_cycle'].isnull().sum())
print(TFS_data_origin['Req_RD_cycle'].mean())
TFS_data_origin['Req_delivery_waiting_cycle'] = (TFS_data_origin['Delivered_date'] - TFS_data_origin['Delivery_date']).dt.days + 1
print(TFS_data_origin['Req_delivery_waiting_cycle'].isnull().sum())
print(TFS_data_origin['Req_delivery_waiting_cycle'].mean())
TFS_data_origin['Req_plan_cycle'] = (TFS_data_origin['Adopted_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1
print(TFS_data_origin['Req_plan_cycle'].isnull().sum())
print(TFS_data_origin['Req_plan_cycle'].mean())
'''
#特定状态下计算周期,其余状态置空。
TFS_data_origin['Req_delivery_cycle'] = np.where(TFS_data_origin['standard_state'].map(lambda x:x=='已交付'),\
((TFS_data_origin['Delivered_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1),\
None)
TFS_data_origin['Req_delivery_waiting_cycle'] = np.where(TFS_data_origin['standard_state'].map(lambda x:x=='已交付'),\
((TFS_data_origin['Delivered_date'] - TFS_data_origin['Delivery_date']).dt.days + 1),\
None)
TFS_data_origin['Req_RD_cycle'] = np.where(TFS_data_origin['standard_state'].map(lambda x:x in ('已交付','可交付')),\
((TFS_data_origin['Delivery_date'] - TFS_data_origin['Adopted_date']).dt.days + 1),\
None)
TFS_data_origin['Req_plan_cycle'] = np.where(TFS_data_origin['standard_state'].map(lambda x:x in ('已采纳','已认领'),\ ((TFS_data_origin['Adopted_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1),\
None)
print(TFS_data_origin['Req_delivery_cycle'], TFS_data_origin['Req_delivery_cycle'])
#选取大于等于1的有效需求交付周期数据
TFS_data = TFS_data_origin[(TFS_data_origin['Req_delivery_cycle']>=1)]
TFS_data.describe()
outputpath0='data_output/TFS_data_new.csv'
TFS_data.to_csv(outputpath0,sep=',',index=True,header=True)
#将周期列转换为数值型数据,方便进行分组统计
TFS_data[['Req_delivery_cycle','Req_delivery_waiting_cycle','Req_RD_cycle',\
'Req_plan_cycle']]=TFS_data[['Req_delivery_cycle','Req_delivery_waiting_cycle',\
'Req_RD_cycle','Req_plan_cycle']].astype('float')
#数据分组统计
#percentiles=[.5, .8] 增加80%分位数:
TFS_statistic = TFS_data.groupby("project_name").describe(percentiles=[.5, .8])
print('*'*10+'TFS_statistic shape'+'*'*10)
print(TFS_statistic.shape)
print('*'*20)
outputpath1='data_output/TFS_statistics_new.csv'
TFS_statistic.to_csv(outputpath1,sep=',',index=True,header=True)