20170410
计算时间,去重,筛选数据
#!/usr/bin/env python # -*- coding: utf-8 -*- #计算时间,筛选数据,删除数据 # import pandas as pd from dateutil.parser import parse import numpy as np dateparse = lambda dates: pd.datetime.strptime(dates,'%Y-%m-%d') data = pd.read_csv(r'E:\7 Python\data\t.csv', dtype={'ID':object,'COLL_OBJ_ID':object,'EQUIP_ID':object,'FAULT_TYPE':object, 'SYNC_ORG_NO':object, 'ORG_NO':object, 'ORG_NAME':object, 'SORT_CODE':object, 'SPEC_CODE':object, 'COMM_MODE': object,'ARRIVE_BATCH_NO':object,'MANUFACTURER':object}, date_parser=dateparse) print data.head(5) print data.info() data.dropna(how='all') data['ORG_NO1'] = [x[:5] for x in data['ORG_NO'].values.astype('str')] data['FAULT_TYPE'] = data['FAULT_TYPE'].str.strip() data['SORT_CODE'] = data['SORT_CODE'].str.strip() #时间处理 data['FAULT_DATE1'] = pd.to_datetime(data['FAULT_DATE'].str.strip().str.split(' ').str[0]) data['INST_DATE1'] = pd.to_datetime(data['INST_DATE'].str.strip().str.split(' ').str[0]) data['DETECT_DATE1'] = pd.to_datetime(data['DETECT_DATE'].str.strip().str.split(' ').str[0], errors = 'coerce') data['APP_DATE1'] = pd.to_datetime(data['APP_DATE'].str.strip().str.split(' ').str[0], errors = 'coerce') data['RUN_DATE1'] = pd.to_datetime(data['RUN_DATE'].str.strip().str.split(' ').str[0], errors = 'coerce') data['BUILD_DATE1'] = pd.to_datetime(data['BUILD_DATE'].str.strip().str.split(' ').str[0], errors = 'coerce') data['PS_DATE1'] = pd.to_datetime(data['PS_DATE'].str.strip().str.split(' ').str[0], errors = 'coerce') data['LAST_CHK_DATE1'] = pd.to_datetime(data['LAST_CHK_DATE'].str.strip().str.split(' ').str[0], errors = 'coerce') # 提取月份 data['FAULT_MONTH'] = [x.month for x in data['FAULT_DATE1']] data['INST_MONTH'] = [x.month for x in data['INST_DATE1']] data['PS_MONTH'] = [x.month for x in data['PS_DATE1']] # 计算使用时间 print sum(data['FAULT_DATE1'].isnull()), sum(data['INST_DATE1'].isnull()) data['work_days'] = data['FAULT_DATE1'] - data['INST_DATE1'] data['work_months'] = [x.days/30 if not pd.isnull(x) else np.nan for x in data['work_days']] #计算库存时间 data['save_days'] = data['INST_DATE1'] - data['DETECT_DATE1'] data['save_months'] = [x.days/30 if not pd.isnull(x) else np.nan for x in data['save_days']] print data['work_months'].value_counts() print data['save_months'].value_counts() # #随机取正态分布补全使用时间 # month_mean = data['work_months'].mean() # month_std = data['work_months'].std() # count_nan_month = data['work_months'].isnull().sum() # count_0_month = len(data['work_months'][data['work_months']<0]) # rand_1 = np.random.randint(month_mean - month_std, month_mean + month_std, size = count_nan_month + count_0_month) # data['work_months'][(data['work_months'].isnull()) | (data['work_months']<0)] = rand_1#这里有错 # #随机取正态分布补全库存时间 # day_mean = data['save_months'].mean() # day_std = data['save_months'].std() # count_nan_day = data['save_months'].isnull().sum() # count_0_day = len(data['save_day'][data['save_day']<0]) # rand_2 = np.random.randint(day_mean - day_std, day_mean + day_std, size = count_nan_day) # data['save_months'][(data['save_months'].isnull()) | (data['work_month']<0)] = rand_2 # #均值补全库存和使用时间 # work_mean = data['work_days'].mean() # data['work_days']=data['work_days'].fillna(work_mean) # day_mean = data['save_days'].mean() # data['save_days']=data['save_days'].fillna(day_mean) print data print data['FAULT_TYPE'].value_counts() #筛选数据 data = data[data['SORT_CODE']=='10'] #选出SORT_CODE 为10 的智能表 data['FAULT_TYPE_1'] = [x[0] for x in data['FAULT_TYPE'].values.astype('str')] data['FAULT_TYPE_3'] = [x[0:3] for x in data['FAULT_TYPE'].values.astype('str')] print data data = data[data['FAULT_TYPE_1']=='4'] data = data[(data['FAULT_TYPE_3']!='412')&(data['FAULT_TYPE']!='4')] data['FAULT_TYPE'].value_counts() #删除数据 data.drop([ 'FAULT_DATE', 'INST_DATE','DETECT_DATE','VAR00001', 'APP_DATE','RUN_DATE','BUILD_DATE','PS_DATE', 'LAST_CHK_DATE','ORG_NO', 'FAULT_DATE1', 'INST_DATE1', 'DETECT_DATE1','APP_DATE1','RUN_DATE1', 'BUILD_DATE1','PS_DATE1','LAST_CHK_DATE1', 'ID','COLL_OBJ_ID','MP_ID','SORT_CODE', 'CT','PT','PAP_R','SP_ID','TARIFF_ID','MR_SECT_NO', 'LINE_ID','TG_ID','CONS_ID','CUST_ID','CONS_NO', 'CUST_QUERY_NO','ORGN_CONS_NO','CONS_SORT_CODE','CUST_ORG_NO', 'CHECKER_NO','CUST_MR_SECT_NO','ORG_NO1', 'work_days','save_days','FAULT_TYPE_1'], axis=1, inplace=True) print data print data['FAULT_TYPE_3'].value_counts() data.to_csv(r'E:\7 Python\data\t1.csv', index=False)