根据我的上一篇博客:https://www.imooc.com/search/article?words=迷之语法
在此基础上,我又写了一个简单的应用,用来识别并提取文章或通知中的重要日期。目前调试结果尚可,但仍存在一些不足:例如同一段文本中识别出多个日期时,程序只会保留最长的一个,还没有加入判断哪个日期更重要的条件。读者可以根据需要自行完善。
完整代码:
import re
import pandas as pd
# Compiled regular expressions for Chinese date/time phrases.
#
# The numeric suffix of each name is only an identifier; the order in which
# the patterns are tried is decided by the if/elif chain further down, not by
# the order of definition here.
#
# NOTE(review): under Python's ``re`` syntax a character set ends at the first
# unescaped ``]`` after its opening ``[``.  The recurring construct
# ``[[...]*[...]*]X`` therefore parses as two consecutive character sets
# followed by a *literal* ``]`` quantified by ``X``.  Patterns that use ``]+``
# thus require a literal ``]`` in the text, and the ``*``, ``(``, ``)`` and
# ``|`` written inside the sets are ordinary set members, not operators.
# Likewise ``[0123456789101112]`` is just the digit set, and ``[一-十二]`` is
# the code point range 一..十 plus 二.  Confirm whether this is intended before
# changing any pattern.

# Optional year, required month and day, optional weekday characters, optional
# time-of-day characters, one required clock time, then an optional separator
# (--, -, 至, 次日 variants) and optional further clock times.
pattern1 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# Same shape as pattern1, but year/month/day are all optional and the bracket
# construct is quantified with ``+`` (see the note above).
pattern16 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# Bracket construct first, then optional date parts, optional time-of-day
# characters, one required clock time and an optional separator/second time.
pattern2 = re.compile(r'[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+(\d+年+)*(\d+月+)*(\d+日+)*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# Optional date parts, bracket construct (``+``), then two hour-colon groups
# with optional minutes, optionally joined by ``-``.
pattern3 = re.compile(r'(\d+年)*(\d+月)*(\d+日)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')
# Dash-separated numeric date (digits-digits-digits), any characters, optional
# time-of-day characters, then two hour-colon groups.
pattern4 = re.compile(r'(\d+\-\d+\-\d+)+(.)*[上中下午晚早上]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')
# Optional date parts, a deadline word (下班前 / 之前 / 前), the bracket
# construct (``+``), and a second deadline word.
pattern5 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+(下班前|之前|前)+')
# Optional date parts and deadline word, bracket construct (``+``), optional
# time-of-day characters and optional clock time.
pattern6 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*(下班前|之前|前)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二点]+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# Optional date parts, bracket construct (``+``), optional time-of-day
# characters, then a required "<digits>点" followed by "<digits>分".
pattern7 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[上中下午晚早上]*(\d+(点半|点)+)+(\d+分)+')
# Optional date parts, bracket construct (``+``), then an optional
# parenthesised HH:MM range joined by ``-`` or 至.
pattern8 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[\(]*(([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*(-|至)*(([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*[\)]*')
# Optional bracket construct, optional year, required month and day, at least
# one time-of-day character, one required clock time, optional second time.
pattern9 = re.compile(r'[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*(\d+年+)*(\d+月+)+(\d+日+)+[上中下午晚早上]+(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# Optional year, required month and day, optional bracket construct, then two
# hour-colon groups.
pattern10 = re.compile(r'(\d+年)*(\d+月)+(\d+日)+[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')
# Optional year, required month and day, at least one arbitrary character,
# optional time-of-day characters, then two hour-colon groups.
pattern11 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(.)+[上中下午晚早上]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')
# Month/day date with a deadline word both before and after the optional
# bracket construct.
pattern12 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)+')
# As pattern12, but only the leading deadline word is required.
pattern18 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)*')
# As pattern12, but only the trailing deadline word is required.
pattern19 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)+')
# Month/day date, required deadline word, optional bracket construct, optional
# time-of-day characters and optional clock time.
pattern13 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二点]+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# Characters from 明/后/天, then a parenthesised month/day date, then 全天.
pattern20 = re.compile(r'[明后天]+(\()+(\d+月+)+(\d+日+)+(\))+(全天)+')
# Month/day date, optional bracket construct, optional time-of-day characters,
# then a required "<digits>点" followed by "<digits>分".
pattern14 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[上中下午晚早上]*(\d+(点半|点)+)+(\d+分)+')
# Month/day date, optional bracket construct, optional "(", then a required
# deadline word.
pattern15 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[\(]*(下班前|之前|前)+')
# Everything before the time is optional except at least one time-of-day
# character; then one required clock time and an optional second time.
pattern17 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*[上中下午晚早上]+(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# Directory that holds the input workbook ``data.xlsx``.
path = "E:/NLP/DataSet/Data"

# Characters that survive the clean-up step; every other character is removed
# from a matched candidate.  The set is spelled out as literal characters
# because writing it as a regex character class turns the hyphens into
# (partly invalid) ranges.
# NOTE(review): 七, 八 and 九 are absent from the original list and are
# therefore stripped as well; confirm whether that is intended.
_ALLOWED_CHARS = frozenset(
    "年月日星期本周上中下午晚早上一二两三四五六十时点半分次至下班之前明后全天"
    "m0123456789:;()-— ,"
)

# A candidate is only kept when it contains at least one digit.
_DIGIT = re.compile(r"\d")


def _ordered_patterns():
    """Return the module's compiled patterns in the order they are tried.

    The earlier a pattern appears, the stricter it is meant to be.  The lookup
    is done lazily so that the helpers in this block can be used with a custom
    pattern list without the module-level patterns being defined.
    """
    return (
        pattern4, pattern5, pattern7, pattern1, pattern16,
        pattern2, pattern3, pattern6, pattern8, pattern9,
        pattern10, pattern12, pattern18, pattern19, pattern13,
        pattern20, pattern14, pattern15, pattern17, pattern11,
    )


def _first_match(segment, patterns):
    """Return the text matched by the first pattern that matches *segment*.

    Only the first matching pattern is considered: its match is printed, and
    it is returned if it contains a digit.  If it contains no digit the
    segment yields ``None``; later patterns are not consulted.
    """
    for pattern in patterns:
        match = pattern.search(segment)
        if match:
            found = match.group(0)
            print(found)
            return found if _DIGIT.search(found) else None
    return None


def _strip_non_time_chars(text):
    """Drop every character of *text* that is not in ``_ALLOWED_CHARS``."""
    return "".join(ch for ch in text if ch in _ALLOWED_CHARS)


def _normalise(candidates):
    """Reduce *candidates* to a list holding at most one cleaned-up string.

    Steps: strip unrelated characters from every candidate, keep the longest
    one (the first on ties), turn deadline words into the closing time
    ``17:00`` and remove one leading label character when it occurs more than
    twice.
    """
    if not candidates:
        return []

    best = max((_strip_non_time_chars(c) for c in candidates), key=len)

    # "下班前" (before the end of the working day) becomes 17:00; a bare "前"
    # is only rewritten when the text contains "日前".
    best = best.replace("下班前", "17:00")
    if "日前" in best:
        best = best.replace("前", "17:00")

    # Remove one leading label character (as in a "时间:" prefix) when the
    # character appears more than twice.  Counting is done on the current
    # text, so each step sees the result of the previous one.
    for label_char in ("时", "间", ":"):
        if best.count(label_char) > 2:
            best = best.replace(label_char, "", 1)

    return [best]


def extract_time(text, patterns=None):
    """Extract the most detailed date/time phrase from *text*.

    Args:
        text: Cell content; it is converted with ``str`` and split on "于",
            and each piece is searched separately.
        patterns: Compiled patterns in priority order.  Defaults to the
            module's ``pattern*`` objects in their established order.

    Returns:
        A list that is empty when nothing usable was found, otherwise a list
        with exactly one normalised string.
    """
    if patterns is None:
        patterns = _ordered_patterns()

    candidates = []
    for segment in str(text).split("于"):
        found = _first_match(segment, patterns)
        if found is not None:
            candidates.append(found)
    return _normalise(candidates)


def main():
    """Read ``data.xlsx``, extract a time per row and write ``out_time.xlsx``."""
    df1 = pd.read_excel(path + "/data.xlsx", sheet_name=0)
    df = df1.iloc[:, 0]  # only the first column is analysed

    re_time = [extract_time(cell) for cell in df]
    re_time = pd.DataFrame(re_time, index=df.index)

    out_time = pd.concat([df1, re_time], axis=1)
    out_time.to_excel('out_time.xlsx')
    print('Done')


if __name__ == "__main__":
    main()
为了方便,本程序的输入和输出均为 Excel 文件:输入为 data.xlsx(只分析第一列),输出为 out_time.xlsx。
运行结果: