从字串或文本中识别中英文日期、时间(自用)

 

def get_DateAndTime_fromString(str1:str, convert_cNumber_2_eNumber:bool=True, sortBySite:bool=True,replace_char:str = '*')->list:
    """Find Chinese- or ASCII-formatted dates, times and datetimes in a text.

    The text is scanned in several passes: first the combined date+time
    patterns, then the time-only pattern, then the date-only pattern. Every
    matched span is overwritten with ``replace_char`` (same length) so that
    later passes cannot match it again and all offsets stay valid.

    Args:
        str1: The text to search.
        convert_cNumber_2_eNumber: When True (default), Chinese numerals are
            first converted to ASCII digits through
            ``my_number().convert_number_from_string``. When False, dates
            written with Chinese numerals are not recognised.
        sortBySite: When True (default), results are ordered by their start
            position in the text; when False, they stay in discovery order.
        replace_char: Masking character; must be a string of length 1.

    Returns:
        A list of dicts, one per recognised value::

            {'value': datetime.datetime | datetime.time | datetime.date,
             'find_index': int,   # discovery order (0-based)
             'start': int,        # start offset of the matched text
             'length': int,       # length of the matched text
             'before': str,       # working text just before this match was masked
             'after': str}        # working text just after this match was masked

        Offsets refer to the text after numeral conversion (if enabled).
        For backward compatibility, ``False`` is returned instead of a list
        when ``replace_char`` is not a one-character string.
    """
    # Local imports keep the function self-contained.
    import datetime
    import re

    if not (isinstance(replace_char, str) and len(replace_char) == 1):
        return False

    if convert_cNumber_2_eNumber:
        text = my_number().convert_number_from_string(string=str1)
    else:
        text = str1

    # --- date pieces ---
    year_regex = r'([1-2]{1}[0-9]{3})[-年]{0,1}'
    # The optional leading digit must allow '0', otherwise zero-padded months
    # such as "03" are read as month 0 and the whole date is rejected.
    month_regex = r'([0-1]{0,1}[0-9]{1})[-月]{0,1}'
    day_regex = r'([0-3]{0,1}[0-9]{1})[-日]{0,1}'
    date_regex = year_regex + month_regex + day_regex
    # --- time pieces ---
    hour_regex = r'([0-2]{0,1}[0-9]{1})[:时]{1}'
    minute_regex = r'([0-5]{0,1}[0-9]{1})[:分]{1}'
    second_regex = r'([0-5]{0,1}[0-9]{1})[秒]{0,1}'
    time_regex = hour_regex + minute_regex + second_regex
    # --- combined patterns, tried before the plain time/date patterns ---
    datetime_patterns = [
        date_regex + r'[\s]*' + time_regex,
        date_regex + r'[T]{1}' + time_regex,
        date_regex + r'[\s]*' + hour_regex + '整',
        date_regex + r'[\s]*' + hour_regex + minute_regex,
    ]

    stages = [(pattern, 'datetime') for pattern in datetime_patterns]
    stages.append((time_regex, 'time'))
    stages.append((date_regex, 'date'))

    result = []
    for pattern, kind in stages:
        # finditer keeps scanning the text as it was when this pass started;
        # rebinding ``text`` below only affects the following passes.
        for match in re.compile(pattern).finditer(text):
            before = text
            start, end = match.span()
            length = end - start
            # Mask exactly the matched span (not every textual occurrence),
            # even if the value later turns out to be invalid.
            text = text[:start] + replace_char * length + text[end:]
            after = text

            fields = [int(group) for group in match.groups()]
            try:
                if kind == 'datetime':
                    # Patterns without minute/second groups default them to 0.
                    fields += [0] * (6 - len(fields))
                    value = datetime.datetime(*fields)
                elif kind == 'time':
                    value = datetime.time(*fields)
                else:
                    value = datetime.date(*fields)
            except ValueError:
                # Digits matched the pattern but are not a real date/time
                # (e.g. month 13); skip this candidate.
                continue

            result.append({'value': value, 'find_index': len(result), 'start': start,
                           'length': length, 'before': before, 'after': after})

    if sortBySite:
        # Order by position in the text; 'find_index' keeps discovery order.
        result.sort(key=lambda item: item['start'])
    return result
class my_number:
    """Convert Chinese numeral characters embedded in text to ASCII digits.

    The conversion is character-based: each single-digit character becomes
    its digit, and each "tens" character (十, 拾, 廿, 卅, 卌) becomes one or two
    digits depending on whether numeral characters stand next to it.
    """

    def __init__(self, **kwargs):
        # Arbitrary keyword arguments are stored but not otherwise used here.
        self.kwargs = kwargs.copy()
        # ASCII digit -> Chinese characters that represent it.
        self.chi_single = {'0': ['〇', '零'], '1': ['一', '壹'], '2': ['二', '贰'], '3': ['三', '叁'], '4': ['四', '肆'],
                           '5': ['五', '伍'], '6': ['六', '陆'], '7': ['七', '柒'], '8': ['八', '捌'], '9': ['九', '玖']}
        # Two-digit value -> Chinese "tens" characters.
        # Single-character forms for 50 and above are intentionally not
        # enabled because they easily cause wrong conversions.
        self.chi_double = {'10': ['十', '拾'], '20': ['廿'], '30': ['卅'], '40': ['卌']}

    def chi_single_values(self):
        """Return the sorted list of all single-digit Chinese characters."""
        return sorted({char for chars in self.chi_single.values() for char in chars})

    def chi_double_values(self):
        """Return the sorted list of all "tens" Chinese characters.

        Pronunciations: 廿 niàn (20), 卅 sà (30), 卌 xì (40).
        """
        return sorted({char for chars in self.chi_double.values() for char in chars})

    def convert_number_from_string(self, string: str):
        """Replace Chinese numeral characters in ``string`` with ASCII digits.

        A "tens" character is rendered according to its numeral neighbours:
        alone it yields its full value ("十" -> "10"); followed by a numeral
        it yields only its tens digit ("十五" -> "15"); preceded by a numeral
        it yields only "0" ("二十" -> "20"); between two numerals it yields
        nothing ("二十三" -> "23").

        Args:
            string: Text to convert.

        Returns:
            The converted text. Non-string input and the empty string are
            returned unchanged.
        """
        if not isinstance(string, str) or not string:
            return string

        # Build the lookup tables once; the first key listing a character wins.
        single_map = {}
        for digit, chars in self.chi_single.items():
            for char in chars:
                single_map.setdefault(char, digit)
        double_map = {}
        for value, chars in self.chi_double.items():
            for char in chars:
                double_map.setdefault(char, value)
        numerals = set(single_map) | set(double_map)

        last = len(string) - 1
        pieces = []
        for i, curr in enumerate(string):
            if curr in single_map:
                pieces.append(single_map[curr])
            elif curr in double_map:
                value = double_map[curr]
                # Neighbours are looked up with bounds checks so that a
                # one-character text such as "十" no longer raises IndexError.
                prev_char = string[i - 1] if i > 0 else ''
                next_char = string[i + 1] if i < last else ''
                phrase = ((prev_char if prev_char in numerals else '')
                          + curr
                          + (next_char if next_char in numerals else ''))
                if len(phrase) == 1:
                    pieces.append(value)      # stands alone: full value
                elif phrase[0] == curr:
                    pieces.append(value[0])   # leads the phrase: tens digit
                elif phrase[-1] == curr:
                    pieces.append(value[1])   # ends the phrase: trailing zero
                else:
                    pieces.append('')         # in the middle: purely positional
            else:
                pieces.append(curr)
        return ''.join(pieces)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值