python正则表达式 re.sub进行数据的清洗
1、替换多余字符
users[i]=users[i].apply(lambda x:re.sub('外勤','',str(x)))
users[i] = users[i].apply(lambda x: re.sub('\n', '', str(x)))
users[i] = users[i].apply(lambda x: re.sub(' ', '', str(x)))
users[i] = users[i].apply(lambda x: re.sub('nan', '18:00', str(x)))
users[i] = users[i].apply(lambda x: re.sub('(离职)', '', str(x)))
处理前:
处理后:
项目:计算加班时长代码(超过18:00的加班的总时长)
# -*- coding: UTF-8 -*-co
import pandas as pd
import numpy as np
import re
users=pd.read_excel('F://1.xlsx',header=2,encoding='utf-8')
users.drop(['考勤组','部门','职位','UserId'],axis=1,inplace=True)
for i in (users.columns):
users[i]=users[i].apply(lambda x:re.sub('外勤','',str(x)))
users[i] = users[i].apply(lambda x: re.sub('\n', '', str(x)))
users[i] = users[i].apply(lambda x: re.sub(' ', '', str(x)))
users[i] = users[i].apply(lambda x: re.sub('nan', '18:00', str(x)))
users[i] = users[i].apply(lambda x: re.sub('(离职)', '', str(x)))
for j in range(len(users.columns)):
for i in range(len(users)):
f=j+2
if f<=len(users.columns)-1:
users.iloc[i,f]=users.iloc[i,f][-5:]
else:
break
for j in range(len(users.columns)):
for i in range(len(users)):
f = j + 2
if f <= (len(users.columns)-1):
users.iloc[i, f] = int(users.iloc[i,f][:2])+int(users.iloc[i,f][-2:])/60
else:
break
for j in range(len(users.columns)):
for i in range(len(users)):
f=j+2
if f <= len(users.columns) - 1:
if users.iloc[i,f]>=18 and users.iloc[i,f]<24:
users.iloc[i,f] =users.iloc[i,f]-18
elif users.iloc[i,f]<=6 and 0 <= users.iloc[i,f]:
users.iloc[i, f] = users.iloc[i, f] + 6
else:
users.iloc[i, f] = 0
else:
break
users['sum']=users['3']*0.0
for i in range(len(users)):
sum=0
for j in range(len(users.columns)):
f=j+2
if f <= len(users.columns) - 1:
sum=sum+users.iloc[i, f]
else:
break
users['sum'][i]=sum
users.to_excel('F://2.xlsx',index=None)
结果算出每一个人的加班时长