关键词提取与时间关键词的提取

套头衫01

于 2024-08-07 10:05:39 发布

阅读量215

点赞数 5

文章标签： python 开发语言

本文链接：https://blog.csdn.net/m0_51944012/article/details/140980666

版权

#!/usr/bin/python3

# -*- coding: utf-8 -*-

# author:SingWeek

import re

import jieba

from datetime import datetime,timedelta

from dateutil.parser import parse

import jieba

import jieba.posseg as psg

# Maps single Chinese numeral characters and ASCII digit characters to their integer value.
UTIL_CN_NUM={'零':0,'一':1,'二':2,'三':3,'四':4,'五':5,'六':6,'七':7,'八':8,'九':9,'0':0,'1':1,'2':2,'3':3,'4':4,'5':5,'6':6,'7':7,'8':8,'9':9}

# Maps Chinese magnitude characters (ten, hundred, thousand, ten-thousand) to their multiplier.
UTIL_CN_UNIT={'十':10,'百':100,'千':1000,'万':10000}

# Time/date extraction: date of a given weekday in the previous week

def get_lastweek(day=1, today=None):
    """Return the date of weekday ``day`` in the previous week.

    Weeks run Monday (1) through Sunday (7), following ``isoweekday``.

    Args:
        day: ISO weekday number, 1 = Monday ... 7 = Sunday.
        today: Optional reference ``date``/``datetime``. Defaults to the
            current local time; pass a fixed value for reproducible results.

    Returns:
        The date formatted as ``'YYYY年MM月DD日'`` (zero-padded).
    """
    reference = datetime.now() if today is None else today
    # Subtracting isoweekday() days lands on the Sunday that closed the
    # previous week (also when the reference day itself is a Sunday).
    previous_sunday = reference - timedelta(days=reference.isoweekday())
    target = previous_sunday - timedelta(days=7 - day)
    return '{:04d}年{:02d}月{:02d}日'.format(target.year, target.month, target.day)

# Time/date extraction: date of a given weekday in the following week

def get_nextweek(day=1, today=None):
    """Return the date of weekday ``day`` in the following week.

    Weeks run Monday (1) through Sunday (7), following ``isoweekday``.

    Args:
        day: ISO weekday number, 1 = Monday ... 7 = Sunday.
        today: Optional reference ``date``/``datetime``. Defaults to the
            current local time; pass a fixed value for reproducible results.

    Returns:
        The date formatted as ``'YYYY年MM月DD日'`` (zero-padded).
    """
    reference = datetime.now() if today is None else today
    # Anchor on the Sunday that closed the previous week, then skip the whole
    # current week (7 days) plus ``day`` days into the next one.
    previous_sunday = reference - timedelta(days=reference.isoweekday())
    target = previous_sunday + timedelta(days=7 + day)
    return '{:04d}年{:02d}月{:02d}日'.format(target.year, target.month, target.day)

# Time/date extraction: date of a given weekday in the current week

def get_week(day=1, today=None):
    """Return the date of weekday ``day`` in the current week.

    Weeks run Monday (1) through Sunday (7), following ``isoweekday``.

    Args:
        day: ISO weekday number, 1 = Monday ... 7 = Sunday.
        today: Optional reference ``date``/``datetime``. Defaults to the
            current local time; pass a fixed value for reproducible results.

    Returns:
        The date formatted as ``'YYYY年MM月DD日'`` (zero-padded).
    """
    reference = datetime.now() if today is None else today
    # Anchor on the Sunday that closed the previous week, then count forward.
    previous_sunday = reference - timedelta(days=reference.isoweekday())
    target = previous_sunday + timedelta(days=day)
    return '{:04d}年{:02d}月{:02d}日'.format(target.year, target.month, target.day)

# Time/date extraction: discard or clean candidate strings that cannot be dates

def check_time_valid(word):
    """Filter and clean one candidate time string.

    Args:
        word: Candidate text collected by the tokenizer pass.

    Returns:
        ``None`` when ``word`` is a bare number of six digits or fewer (too
        short to be a full date such as ``20240807``); otherwise ``word`` with
        any digits trailing a final ``号``/``日`` removed and that marker
        normalised to ``日``.
    """
    if re.match(r"\d+$", word) and len(word) <= 6:
        return None
    # Digits glued on after the day marker are noise ("8月21日12" -> "8月21日").
    # The result ends in '日', so it can neither match this pattern again nor
    # be a bare number; no re-check is needed. (The previous version recursed
    # on the unchanged input here and never terminated.)
    return re.sub(r"[号日]\d+$", "日", word)

# Time/date extraction: convert a Chinese or ASCII numeral string to an int

def cn2dig(src):
    """Convert a numeral written in Chinese characters or digits to an int.

    Args:
        src: Text such as ``'21'``, ``'二十一'`` or ``'十五'``.

    Returns:
        The integer value, or ``None`` when ``src`` is empty or contains a
        character found in neither ``UTIL_CN_NUM`` nor ``UTIL_CN_UNIT``.
        If ``src`` starts with digits, that leading run is returned and
        anything after it is ignored.
    """
    if not src:
        return None
    leading_digits = re.match(r"\d+", src)
    if leading_digits:
        return int(leading_digits.group(0))

    rsl = 0
    unit = 1
    # Walk from the least significant character: a unit character sets the
    # multiplier that applies to the numeral read next (to its left).
    for item in reversed(src):
        if item in UTIL_CN_UNIT:
            unit = UTIL_CN_UNIT[item]
        elif item in UTIL_CN_NUM:
            rsl += UTIL_CN_NUM[item] * unit
        else:
            return None
    # A leading unit carries an implicit "one" ("十五" is 15, "十" is 10).
    # Keying this on the first character, rather than on rsl < unit, keeps
    # "零" at 0 instead of turning it into 1.
    if src[0] in UTIL_CN_UNIT:
        rsl += unit
    return rsl

# Time/date extraction: convert a year written digit by digit to an int

def year2dig(year):
    """Convert a year written digit by digit into an integer.

    Each character found in ``UTIL_CN_NUM`` is replaced by its digit; other
    characters are kept as they are. The leading run of digits of the result
    is then read as the year.

    Args:
        year: Year text without the trailing ``年``, e.g. ``'二零二四'``.

    Returns:
        The year as an int. A two-digit value is placed in the current
        century. ``None`` when the converted text does not start with a digit.
    """
    converted = ''.join(
        str(UTIL_CN_NUM[ch]) if ch in UTIL_CN_NUM else ch
        for ch in year
    )
    leading = re.match(r"\d+", converted)
    if not leading:
        return None
    digits = leading.group(0)
    if len(digits) != 2:
        return int(digits)
    # Two-digit shorthand: prefix with the current century.
    century = datetime.today().year // 100 * 100
    return century + int(digits)

# Time/date extraction: parse one candidate string into 'YYYY-MM-DD HH:MM:SS'

def parse_datetime(msg):
    """Parse one time expression into a ``'YYYY-MM-DD HH:MM:SS'`` string.

    ``dateutil``'s fuzzy parser is tried first. If it rejects the text, a
    regular expression for Chinese date/time expressions (year, month, day,
    optional part-of-day word, hour, minute, second) is applied instead.
    Date fields missing from the text default to today's values; time fields
    default to zero.

    Args:
        msg: Candidate time text, e.g. ``'2024年08月21日下午三点'``.

    Returns:
        The formatted timestamp, or ``None`` when ``msg`` is empty, contains
        no recognisable date/time component, or describes an out-of-range
        value (for example month 13).
    """
    if msg is None or len(msg) == 0:
        return None

    try:
        dt = parse(msg, fuzzy=True)
    except Exception:
        # Best-effort first attempt: whatever dateutil objects to, fall back
        # to the Chinese-expression regex below.
        pass
    else:
        # '%S' is seconds; the former '%s' is a platform-specific extension
        # (epoch seconds on glibc, unsupported elsewhere).
        return dt.strftime('%Y-%m-%d %H:%M:%S')

    m = re.match("([0-9零一二三四五六七八九十]+年)?([0-9零一二三四五六七八九十]+月)?([0-9零一二三四五六七八九十]+[号日])?([上中下午晚早]+)?([0-9零一二三四五六七八九十百]+[点:时])?([0-9零一二三四五六七八九十百]+分)?([0-9零一二三四五六七八九十百]+秒)?", msg)
    # Every group is optional, so the pattern also "matches" the empty string;
    # an empty match means the text carries no date/time information at all.
    if m is None or not m.group(0):
        return None

    year_txt, month_txt, day_txt, period, hour_txt, minute_txt, second_txt = m.groups()

    # Fields absent from params keep today's value in replace() below.
    params = {'hour': 0, 'minute': 0, 'second': 0}
    if year_txt is not None:
        year = year2dig(year_txt[:-1])  # [:-1] drops the trailing unit character
        if year is not None:
            params['year'] = year
    for name, raw in (('month', month_txt), ('day', day_txt),
                      ('hour', hour_txt), ('minute', minute_txt),
                      ('second', second_txt)):
        if raw is None:
            continue
        value = cn2dig(raw[:-1])
        if value is not None:
            params[name] = value

    # Afternoon / evening / noon markers turn a 12-hour reading into 24-hour.
    if period in ('下午', '晚上', '中午') and params['hour'] < 12:
        params['hour'] += 12

    try:
        target_date = datetime.today().replace(**params)
    except ValueError:
        # Out-of-range component such as month 13 or hour 25.
        return None
    return target_date.strftime('%Y-%m-%d %H:%M:%S')

# Time/date extraction: find and normalise all time expressions in a text

def time_extract(text):
    """Extract time expressions from ``text`` and normalise them.

    The text is segmented with ``jieba.posseg``. Consecutive tokens tagged as
    numeral (``m``) or time (``t``) are glued into candidate strings. Relative
    day words (今天, 明天, ...) and week expressions (上周三, 下周一, 周五, ...)
    are resolved to concrete dates, and each candidate is then passed through
    ``check_time_valid`` and ``parse_datetime``.

    Args:
        text: Input sentence.

    Returns:
        A list of ``'YYYY-MM-DD HH:MM:SS'`` strings (possibly empty), or
        ``None`` if validating/parsing the candidates raised an exception.
    """
    keyDate = {'今天': 0, '明天': 1, '后天': 2, '昨天': -1, '前天': -2}
    timedic = ['时', '分', '到']
    # Characters accepted after 周 / 上周 / 下周, as ISO weekday numbers.
    weekday_num = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6,
                   '日': 7, '天': 7,
                   '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}

    tokens = [(k, v) for k, v in psg.cut(text)]

    # Pass 1: collect raw candidate strings.
    time_res = []
    word = ''
    for i, (k, v) in enumerate(tokens):
        if k in keyDate:
            # A relative-day word starts a new candidate; keep any pending
            # one instead of silently overwriting it.
            if word != '':
                time_res.append(word)
            word = (datetime.today() + timedelta(days=keyDate[k])).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
        elif k == '到':
            # "到" separates the two ends of a time range.
            if word != '':
                time_res.append(word)
                word = ''
        elif word != '':
            if v in ('m', 't'):
                word += k
                # Pull in a following unit/connector token right away.
                if i + 1 < len(tokens) and tokens[i + 1][0] in timedic:
                    word += tokens[i + 1][0]
            elif k not in timedic:
                time_res.append(word)
                word = ''
        elif v in ('m', 't'):
            word = k
    if word != '':
        time_res.append(word)

    # Pass 2: resolve week-relative expressions to concrete dates. Entries
    # with a week prefix but no recognisable weekday are dropped.
    resolved = []
    for item in time_res:
        if item[:2] in ('上周', '下周'):
            day = weekday_num.get(item[2:3])
            if day is not None:
                resolver = get_lastweek if item[:2] == '上周' else get_nextweek
                resolved.append(resolver(day) + item[3:])
        elif item[:1] == '周':
            day = weekday_num.get(item[1:2])
            if day is not None:
                resolved.append(get_week(day) + item[2:])
        else:
            resolved.append(item)

    # Pass 3: validate and parse. Kept best-effort: any failure yields None.
    try:
        valid = [w for w in (check_time_valid(c) for c in resolved) if w is not None]
        parsed = [parse_datetime(w) for w in valid]
        return [x for x in parsed if x is not None]
    except Exception:
        return None

# Keyword extraction

def chinese_word_cut(mytext):
    """Segment ``mytext`` and return its keywords.

    Only runs of characters in the range U+4E00..U+9FA5 are kept before
    segmentation. Tokens listed in ``stop_word.txt`` and single-character
    tokens are removed from the result.

    Args:
        mytext: Raw input text.

    Returns:
        List of remaining tokens, in segmentation order (duplicates kept).
    """
    # Custom dictionary: words jieba would otherwise split apart.
    jieba.load_userdict('jiebacfg.txt')
    jieba.initialize()

    # Pre-processing: keep only the Chinese runs, joined by single spaces.
    chinese_only = " ".join(re.findall('[\u4e00-\u9fa5]+', mytext, re.S))
    tokens = jieba.lcut(chinese_only)

    # Stop-word list, one entry per line.
    with open('stop_word.txt', encoding='utf-8') as f:
        stop_words = {entry.replace("\n", "") for entry in f.readlines()}

    return [token for token in tokens
            if token not in stop_words and len(token) > 1]

def main():
    """Demo: run time and keyword extraction on the first ten dictionary lines.

    Each of the first ten lines of ``jiebacfg.txt`` is embedded in a fixed
    sample sentence; the extracted timestamps followed by the de-duplicated
    keywords are printed.
    """
    with open("jiebacfg.txt", "r", encoding="utf-8") as f:
        # zip() with range first stops after ten lines without reading more.
        for _, line in zip(range(10), f):
            # strip(): keep the line's trailing newline out of the sentence.
            text3 = "我们公司的" + line.strip() + "在八月二十一日到八月二十二日的消息"
            # time_extract returns None on failure; treat that as "no times".
            times = time_extract(text3) or []
            print(times + list(set(chinese_word_cut(text3))))


if __name__ == "__main__":
    main()

套头衫01

关注

5
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫