Python实现文本简单正则匹配代码如下:
# Example 1: print every sentence that contains a keyword (the pattern
# used below is the literal text '文本').
# re.search(regex, string) scans the whole string and returns a match
# object for the first occurrence, or None when the pattern is absent.
import re

text_string = (
    '文本最重要的来源无疑是网络。我们要把网络中的文本获取形成一个文本数据库。利用一个爬虫抓取到网络中的信息。'
    '爬取的策略有广度爬取和深度爬取。根据用户的需求,爬取可以有主题爬取和通用爬虫之分。'
)
regex = '文本'
# Split the paragraph into sentences on the Chinese full stop.
p_string = text_string.split('。')
for line in p_string:
    # A non-None result means the pattern occurs somewhere in this sentence.
    if re.search(regex, line) is not None:
        print(line)
print()  # blank line after this example's output
# Example 2: match any single character.
# A dot (.) in a pattern stands for exactly one character of any kind,
# except a newline.
import re

text_string = (
    '文本最重要的来源无疑是网络。我们要把网络中的文本获取形成一个文本数据库。利用一个爬虫抓取到网络中的信息。'
    '爬取的策略有广度爬取和深度爬取。根据用户的需求,爬取可以有主题爬取和通用爬虫之分。'
)
# '用户' followed by any one character.
regex = '用户.'
p_string = text_string.split('。')
for line in p_string:
    if re.search(regex, line) is not None:
        print(line)
print()  # blank line after this example's output
# Example 3: anchor a match to the start or the end of a string.
# ^ matches at the beginning of the string; $ matches at the end.
import re

text_string = (
    '文本最重要的来源无疑是网络。我们要把网络中的文本获取形成一个文本数据库。利用一个爬虫抓取到网络中的信息。'
    '爬取的策略有广度爬取和深度爬取。根据用户的需求,爬取可以有主题爬取和通用爬虫之分。'
)
# Sentences that END with '信息'. To select sentences that START with
# '文本' instead, use the pattern '^文本'.
regex = '信息$'
p_string = text_string.split('。')
for line in p_string:
    if re.search(regex, line) is not None:
        print(line)
print()  # blank line after this example's output
# Example 4: use square brackets to match one character out of a set.
# [...] matches any single character listed inside the brackets.
import re

text_string = '[重要的]今年的第七号台风23日登陆广东沿海地区。上海发布暴雨蓝色预警。[紧要的]中国对印度连发强硬信息,印度急切需要结束对峙'
# A sentence that starts with a literal '[', then '重' or '紧', then any two
# characters, then a literal ']'. Written as a raw string so that the
# backslashes reach the regex engine unchanged.
regex = r'^\[[重紧]..\]'
p_string = text_string.split('。')
for line in p_string:
    if re.search(regex, line) is not None:
        print(line)
    else:
        print('not match')
print()  # blank line after this example's output
# Using the escape character: to match a literal backslash the pattern
# must contain an escaped backslash (\\).
import re

# Both literals are raw strings: the pattern is the two characters \\ and
# the subject contains a single backslash followed by 'd'.
if re.search(r'\\', r'I have one nee\dle') is not None:
    print('match it')
else:
    print('not match')
print()  # blank line after this example's output
# Extract numbers from text.
# [0-9] matches any digit from 0 to 9; [a-z] matches any lowercase letter
# from a to z.
import re

year_string = []
strings = ['war of 1812', 'there are 5280 feet to a mile', 'happy new year 2016']
for string in strings:
    # [1-2] followed by [0-9] repeated three times ({3}), i.e. a four-digit
    # number between 1000 and 2999.
    if re.search(r'[1-2][0-9]{3}', string):
        year_string.append(string)
print(year_string)
print()  # blank line after this example's output
# Extract every year from a string.
# re.findall returns all non-overlapping matches as a list, e.g.
# re.findall('[a-z]', 'abc1234') gives ['a', 'b', 'c'].
import re

years_string = '2018 was a bad year,but 2019 will be better!july 16,2017。16/07/2009。summer 2008'
years = re.findall(r'[1-2][0-9]{3}', years_string)
print(years)
日期识别代码如下:
import re
from datetime import datetime,timedelta
from dateutil.parser import parse
import jieba.posseg as psg
# Lookup tables for converting Chinese numerals.
# UTIL_CN_NUM maps a single digit character (Chinese or ASCII) to its value;
# '两' is a second spelling of 2.
_CN_DIGIT_CHARS = '零一二三四五六七八九'
UTIL_CN_NUM = {char: value for value, char in enumerate(_CN_DIGIT_CHARS)}
UTIL_CN_NUM['两'] = 2
UTIL_CN_NUM.update({str(value): value for value in range(10)})

# UTIL_CN_UNIT maps a positional unit character to its multiplier.
UTIL_CN_UNIT = dict(zip('十百千万', (10, 100, 1000, 10000)))
def cn2dig(src):
    """Convert a numeral string (ASCII digits or Chinese numerals) to an int.

    If ``src`` starts with ASCII/Unicode digits, that leading run is
    returned as an integer. Otherwise the string is read right to left:
    a unit character (in ``UTIL_CN_UNIT``) sets the current multiplier and
    a digit character (in ``UTIL_CN_NUM``) adds ``digit * multiplier``.

    Args:
        src: The numeral text, e.g. ``'28'``, ``'十五'`` or ``'二十'``.

    Returns:
        The integer value, or ``None`` when ``src`` is empty/``None`` or
        contains a character found in neither lookup table.
    """
    if not src:
        return None
    m = re.match(r"\d+", src)
    if m:
        return int(m.group(0))
    rsl = 0
    unit = 1
    for item in reversed(src):
        if item in UTIL_CN_UNIT:
            unit = UTIL_CN_UNIT[item]
        elif item in UTIL_CN_NUM:
            rsl += UTIL_CN_NUM[item] * unit
        else:
            return None
    # A leading unit with no digit in front of it (e.g. '十' or '十五')
    # has not been counted yet: it stands for one of that unit.
    if rsl < unit:
        rsl += unit
    return rsl
def year2dig(year):
    """Convert a year written digit-by-digit to an int.

    Every character found in ``UTIL_CN_NUM`` is replaced by its digit;
    other characters are kept. The leading run of digits in the result is
    then interpreted as the year.

    Args:
        year: Year text without the trailing '年', e.g. ``'2017'``,
            ``'二零一七'`` or ``'17'``.

    Returns:
        The year as an int. A two-digit year is placed in the current
        century (taken from today's date). ``None`` when the converted
        text does not start with a digit.
    """
    res = ''.join(
        str(UTIL_CN_NUM[item]) if item in UTIL_CN_NUM else item
        for item in year
    )
    m = re.match(r"\d+", res)
    if not m:
        return None
    digits = m.group(0)
    if len(digits) == 2:
        # `datetime` here is the class imported via
        # `from datetime import datetime`, so call today() on it directly.
        century = datetime.today().year // 100 * 100
        return century + int(digits)
    return int(digits)
# Pattern for Chinese date/time phrases. Every group is optional:
# 1 = year ('…年'), 2 = month ('…月'), 3 = day ('…号' / '…日'),
# 4 = part of day, 5 = hour ('…点' / ':' / '.' / '时'),
# 6 = minute (optionally followed by '分'), 7 = second ('…秒').
_CN_DATETIME_RE = re.compile(
    r"([0-9零一二两三四五六七八九十]+年)?([0-9一二两三四五六七八九十]+月)?([0-9一二两三四五六七八九十]+[号日])?([上中下午晚早]+)?([0-9零一二两三四五六七八九十百]+[点:\.时])?([0-9零一二三四五六七八九十百]+分?)?([0-9零一二三四五六七八九十百]+秒)?"
)

# Unit/separator characters that can end a captured field.
_CN_DATETIME_SUFFIXES = '年月号日点:.时分秒'


def parse_datetime(msg):
    """Turn a date/time phrase into a ``'%Y-%m-%d %H:%M:%S'`` string.

    ``dateutil``'s fuzzy parser is tried first. If it raises, the phrase is
    matched against a pattern for Chinese date/time expressions; fields
    found in the phrase overwrite the corresponding fields of today's date,
    and hour, minute and second default to 0 when absent.

    Args:
        msg: The phrase to parse, e.g. ``'2017年10月24日下午三点'``.

    Returns:
        The formatted timestamp, or ``None`` when ``msg`` is empty/``None``,
        when nothing in it matches the pattern, or when the extracted
        fields do not form a valid date/time (e.g. day 31 in a 30-day
        month).
    """
    if not msg:
        return None
    try:
        dt = parse(msg, fuzzy=True)
        return dt.strftime('%Y-%m-%d %H:%M:%S')
    except Exception:
        # Deliberate best-effort: any parser failure falls through to the
        # pattern-based extraction below.
        pass

    m = _CN_DATETIME_RE.match(msg)
    # All groups are optional, so the pattern always matches; an empty
    # match means no date/time field was recognised.
    if not m.group(0):
        return None

    fields = {
        "year": m.group(1),
        "month": m.group(2),
        "day": m.group(3),
        "hour": m.group(5),
        "minute": m.group(6),
        "second": m.group(7),
    }
    # Time-of-day fields that are missing from the phrase default to zero.
    params = {"hour": 0, "minute": 0, "second": 0}
    for name, raw in fields.items():
        if not raw:
            continue
        # Drop the trailing unit only when one is present: the minute
        # field may be bare digits ('30' in '4点30').
        number = raw[:-1] if raw[-1] in _CN_DATETIME_SUFFIXES else raw
        value = year2dig(number) if name == 'year' else cn2dig(number)
        if value is not None:
            params[name] = int(value)

    try:
        target_date = datetime.today().replace(**params)
    except ValueError:
        # The extracted fields are out of range for a real date/time.
        return None

    # Afternoon / evening / noon markers shift a 12-hour value to 24-hour.
    is_pm = m.group(4)
    if is_pm in ('下午', '晚上', '中午') and target_date.hour < 12:
        target_date = target_date.replace(hour=target_date.hour + 12)
    return target_date.strftime('%Y-%m-%d %H:%M:%S')
def check_time_valid(word):
    """Filter and tidy a candidate time string before it is parsed.

    Args:
        word: A candidate time expression assembled from segmented text.

    Returns:
        ``None`` when ``word`` consists only of digits and is at most six
        characters long. Otherwise ``word``, with a trailing
        "'号' or '日' followed by digits" replaced by a single '日'
        (e.g. ``'2017年10月24日30'`` becomes ``'2017年10月24日'``).
    """
    if re.match(r"\d+$", word) and len(word) <= 6:
        return None
    # After the substitution the text ends in '日', so the pattern cannot
    # match a second time and a single pass is sufficient.
    return re.sub(r'[号日]\d+$', '日', word)
# Time extraction
def time_extract(text):
    """Extract the date/time expressions mentioned in ``text``.

    The text is segmented with ``psg.cut``; consecutive tokens tagged
    ``'m'`` or ``'t'`` are concatenated into one candidate string, and the
    words 今天 / 明天 / 后天 are replaced by the corresponding calendar date
    (today + 0 / 1 / 2 days) written as ``YYYY年MM月DD日``. Each candidate
    is then passed through ``check_time_valid`` and ``parse_datetime``.

    Args:
        text: The sentence to scan.

    Returns:
        A list of ``'%Y-%m-%d %H:%M:%S'`` strings, one per candidate that
        survived validation and parsing, in order of appearance.
    """
    day_offsets = {'今天': 0, '明天': 1, '后天': 2}
    candidates = []
    word = ''
    for k, v in psg.cut(text):
        if k in day_offsets:
            # A relative-day word always starts a new candidate.
            if word != '':
                candidates.append(word)
            target_day = datetime.today() + timedelta(days=day_offsets[k])
            # The unit characters are inserted with str.format afterwards,
            # so the strftime format string itself stays ASCII-only
            # (presumably to avoid platform encoding problems).
            word = target_day.strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
        elif word != '':
            if v in ('m', 't'):
                # Extend the candidate that is currently being built.
                word = word + k
            else:
                # Any other tag ends the current candidate.
                candidates.append(word)
                word = ''
        elif v in ('m', 't'):
            word = k
    if word != '':
        candidates.append(word)

    checked = [check_time_valid(w) for w in candidates]
    result = [w for w in checked if w is not None]
    final_res = [parse_datetime(w) for w in result]
    return [x for x in final_res if x is not None]
# Demo: run the extractor on five sample sentences and print each sentence
# together with the timestamps found in it, separated by a colon.
text1 = '我要住到明天下午三点'
text2 = '预定28号的房间'
text3 = '我要从26号下午4点住到11月2号'
text4 = '我要预订今天到30的房间'
text5 = '今天30号呵呵'
for sample in (text1, text2, text3, text4, text5):
    print(sample, time_extract(sample), sep=':')
日期识别结果:
来源:Python自然语言处理实战