python标准日期正则_基于Python正则表达式的正文日期识别算法

根据我的上一篇博客:https://www.imooc.com/search/article?words=迷之语法

我又写了一个简单应用,用来识别并提取文章或通知中的重要日期。目前调试效果尚可,但仍存在一些不足:例如一段文字中同时识别出多个日期时,程序还没有判断哪个日期更重要的条件(目前只保留最长的匹配结果)。读者可以自行发现问题,并根据需要加以完善。

完整代码:

import re

import pandas as pd

# ---------------------------------------------------------------------------
# Compiled regular expressions used to spot date/time phrases in Chinese text.
#
# Each pattern combines some of these building blocks:
#   * optional or mandatory  \d+年 / \d+月 / \d+日  (year / month / day) parts
#   * a "weekday" construct  [[...本...周一-日...]*[...星期一-天...]*]
#   * part-of-day characters [上中下午晚早上]
#   * a clock time (N时, N点 / N点半, optional N分, or HH:MM)
#   * an optional range separator (--, -, 至, -次日, 至次日) and a second time
#   * deadline words (下班前 | 之前 | 前)
#
# NOTE(review), from the regex syntax as written:
#   * [0123456789101112] is a character class, i.e. equivalent to [0-9]; it
#     does not restrict the value to 0..12.
#   * [一-十二] is the code-point range 一..十 plus 二, and 一-日 / 一-天 inside
#     the "weekday" classes are likewise code-point ranges, so these classes
#     accept many characters other than numerals / weekday names.
#   * The "weekday" construct is parsed as two character classes followed by
#     a literal ']'.  Where it ends in ']+' (pattern16, pattern2, pattern3,
#     pattern5, pattern6, pattern7, pattern8) a literal ']' is therefore
#     required in the text; where it ends in ']*' it is optional.
#   * The leading '[[' makes Python emit a "possible nested set"
#     FutureWarning at compile time.
#   * (:|:) lists the same alternative twice; presumably one of them was a
#     full-width colon before the text was normalised -- TODO confirm.
# ---------------------------------------------------------------------------

# Month and day required (year optional), then a clock time and an optional
# second time forming a range.
pattern1 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# Same tail as pattern1, but every date part is optional and the "weekday"
# construct ends in ']+'.
pattern16 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# "Weekday" construct first, then optional date parts, then time / time range.
pattern2 = re.compile(r'[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+(\d+年+)*(\d+月+)*(\d+日+)*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# Optional date parts, "weekday" construct, then two hour-colon fragments
# (minutes optional) optionally separated by '-'.
pattern3 = re.compile(r'(\d+年)*(\d+月)*(\d+日)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')

# Dash-separated numeric date (\d+-\d+-\d+), any characters, then two
# hour-colon fragments.
pattern4 = re.compile(r'(\d+\-\d+\-\d+)+(.)*[上中下午晚早上]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')

# Optional date parts with a deadline word both before and after the
# "weekday" construct.
pattern5 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+(下班前|之前|前)+')

# Optional date parts, optional deadline word, "weekday" construct, optional
# clock time.
pattern6 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*(下班前|之前|前)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二点]+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# Optional date parts, "weekday" construct, then N点 / N点半 followed by N分.
pattern7 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[上中下午晚早上]*(\d+(点半|点)+)+(\d+分)+')

# Optional date parts, "weekday" construct, then an optionally parenthesised
# HH:MM or HH:MM range.
pattern8 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[\(]*(([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*(-|至)*(([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*[\)]*')

# Month and day required, at least one part-of-day character required, then
# time / time range.
pattern9 = re.compile(r'[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*(\d+年+)*(\d+月+)+(\d+日+)+[上中下午晚早上]+(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# Month and day required, then two hour-colon fragments.
pattern10 = re.compile(r'(\d+年)*(\d+月)+(\d+日)+[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')

# Month and day required, at least one arbitrary character, then two
# hour-colon fragments.
pattern11 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(.)+[上中下午晚早上]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')

# Month and day required with deadline words: pattern12 needs one on both
# sides of the "weekday" construct, pattern18 only before it, pattern19 only
# after it.
pattern12 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)+')

pattern18 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)*')

pattern19 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)+')

# Month and day required, deadline word required, optional clock time.
pattern13 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二点]+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# One or more of 明 / 后 / 天, a parenthesised month-day, then 全天.
pattern20 = re.compile(r'[明后天]+(\()+(\d+月+)+(\d+日+)+(\))+(全天)+')

# Month and day required, then N点 / N点半 followed by N分.
pattern14 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[上中下午晚早上]*(\d+(点半|点)+)+(\d+分)+')

# Month and day required, then a deadline word.
pattern15 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[\(]*(下班前|之前|前)+')

# All date parts optional, at least one part-of-day character required, then
# time / time range.
pattern17 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*[上中下午晚早上]+(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# ---------------------------------------------------------------------------
# Driver script: read the first column of the first sheet of data.xlsx, pull
# one date/time phrase out of every cell using the compiled patterns defined
# earlier in this file, and write the original sheet plus the extracted
# phrase to out_time.xlsx.
# ---------------------------------------------------------------------------

path = "E:/NLP/DataSet/Data"

df1 = pd.read_excel(r'E:/NLP/DataSet/Data/data.xlsx', sheet_name=0)

# Only the first column of the sheet is scanned.
df = df1.iloc[:, 0]

# Patterns in the order they are tried; the first one that matches a text
# segment wins and the remaining ones are not consulted for that segment.
_PATTERNS_BY_PRIORITY = (
    pattern4, pattern5, pattern7, pattern1, pattern16, pattern2, pattern3,
    pattern6, pattern8, pattern9, pattern10, pattern12, pattern18, pattern19,
    pattern13, pattern20, pattern14, pattern15, pattern17, pattern11,
)

_HAS_DIGIT = re.compile(r'\d')

# Characters allowed to survive in an extracted phrase; everything else is
# dropped.  A plain membership test is used instead of a regex character
# class so that '-' cannot form accidental ranges and stray characters are
# never interpreted as regex syntax.
# NOTE(review): the original class repeated ':', ';', '(', ')' and ',',
# which suggests full-width variants were lost in transit -- confirm and add
# them here if they are needed.
_TIME_CHARS = frozenset(
    '年月日星期本周上中下午晚早上一二两三四五六十时点半分次至下班之前明后全天'
    'm0123456789:;()-— ,'
)


def _first_match(segment):
    """Return the text matched by the highest-priority pattern, or None."""
    for pattern in _PATTERNS_BY_PRIORITY:
        found = pattern.search(segment)
        if found:
            return found.group(0)
    return None


def _keep_time_chars(text):
    """Return *text* with every character outside _TIME_CHARS removed."""
    return ''.join(ch for ch in text if ch in _TIME_CHARS)


def _extract_time(cell):
    """Extract at most one date/time phrase from a spreadsheet cell.

    The cell is converted to ``str`` and split on "于"; each segment
    contributes the match of the first pattern (in priority order) that
    hits, provided the match contains a digit.  Candidates are reduced to
    time-related characters, the longest one is kept, deadline words are
    rewritten to "17:00", and leftover label characters are trimmed.

    Returns a list holding the single resulting string, or an empty list
    when no candidate was found.
    """
    candidates = []
    for segment in str(cell).split("于"):
        matched = _first_match(segment)
        if matched is None:
            continue
        print(matched)
        # A match without any digit is discarded; lower-priority patterns
        # are deliberately not tried for this segment.
        if _HAS_DIGIT.search(matched):
            candidates.append(matched)

    if not candidates:
        return []

    # Step 1: strip characters unrelated to dates and times.
    cleaned = [_keep_time_chars(candidate) for candidate in candidates]

    # Step 2: keep the longest candidate (the first one on ties).
    best = max(cleaned, key=len)

    # Deadline wording is rewritten to the closing time "17:00".
    best = best.replace('下班前', '17:00')
    if '日前' in best:
        best = best.replace('前', '17:00')

    # Prefix removal: when one of these characters occurs more than twice,
    # its first occurrence is dropped.
    for label_char in ('时', '间', ':'):
        if best.count(label_char) > 2:
            best = best.replace(label_char, '', 1)

    return [best]


# One entry per input row: [] when nothing was found, otherwise [phrase].
re_time = [_extract_time(cell) for cell in df]

re_time = pd.DataFrame(re_time, index=df.index)

out_time = pd.concat([df1, re_time], axis=1)

out_time.to_excel('out_time.xlsx')

print('Done')

为了方便,本程序的输入和输出都是Excel文件:输入为 data.xlsx 第一个工作表的第一列,输出为 out_time.xlsx(在原表右侧追加识别出的日期)。

运行结果:

AAffA0nNPuCLAAAAAElFTkSuQmCC

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值