def get_DateAndTime_fromString(str1: str, convert_cNumber_2_eNumber: bool = True, sortBySite: bool = True,
                               replace_char: str = '*') -> list:
    """Find date, time and datetime values inside a string.

    The text is scanned with a fixed sequence of patterns: four combined
    date+time patterns first, then a time-only pattern, then a date-only
    pattern. Every span that matches is overwritten with ``replace_char``
    (same length) so later patterns cannot match it again and all reported
    positions stay valid.

    Args:
        str1: The text to search.
        convert_cNumber_2_eNumber: When True, Chinese numerals in the text are
            first converted to ASCII digits via
            ``my_number().convert_number_from_string``. When False, dates
            written with Chinese numerals are not found.
        sortBySite: When True, results are ordered by their position in the
            text; when False, they stay in discovery order.
        replace_char: Masking character. Must be a string of length 1.

    Returns:
        A list of dicts, one per value found, with the keys:

        * ``'value'``: a ``datetime.datetime``, ``datetime.time`` or
          ``datetime.date`` instance.
        * ``'find_index'``: discovery order of this value (0-based).
        * ``'start'``: start offset of the matched span in the searched text.
        * ``'length'``: length of the matched span.
        * ``'before'``: the searched text right before this span was masked.
        * ``'after'``: the searched text right after this span was masked.

        Offsets and snapshots refer to the text after numeral conversion when
        ``convert_cNumber_2_eNumber`` is True.

        ``False`` is returned instead of a list when ``replace_char`` is not a
        one-character string (kept for backward compatibility).
    """
    # A mask of any other length would shift every later offset.
    if not (isinstance(replace_char, str) and len(replace_char) == 1):
        return False

    # Chinese numerals -> ASCII digits.
    if convert_cNumber_2_eNumber:
        text = my_number().convert_number_from_string(string=str1)
    else:
        text = str1

    # Date pieces: separators are optional, so "20201001" also matches.
    year_regex = r'([1-2]{1}[0-9]{3})[-年]{0,1}'
    month_regex = r'([1]{0,1}[0-9]{1})[-月]{0,1}'
    day_regex = r'([0-3]{0,1}[0-9]{1})[-日]{0,1}'
    date_regex = year_regex + month_regex + day_regex

    # Time pieces: hour and minute require a trailing separator.
    hour_regex = r'([0-2]{0,1}[0-9]{1})[:时]{1}'
    minute_regex = r'([0-5]{0,1}[0-9]{1})[:分]{1}'
    second_regex = r'([0-5]{0,1}[0-9]{1})[秒]{0,1}'
    time_regex = hour_regex + minute_regex + second_regex

    # Combined patterns, tried from most to least specific.
    datetime_regex = [date_regex + r'[\s]*' + time_regex,
                      date_regex + r'[T]{1}' + time_regex,
                      date_regex + r'[\s]*' + hour_regex + '整',
                      date_regex + r'[\s]*' + hour_regex + minute_regex]

    patterns = [(pattern, 'datetime') for pattern in datetime_regex]
    patterns.append((time_regex, 'time'))
    patterns.append((date_regex, 'date'))

    result = []
    for pattern, kind in patterns:
        regex = re.compile(pattern)
        # finditer keeps scanning the string object it was given, so rebinding
        # ``text`` below does not disturb the matches of the current pattern.
        for match in regex.finditer(text):
            before = text
            start = match.start()
            length = len(match.group())
            # Mask exactly the matched span (not every equal substring), so an
            # identical value elsewhere in the text keeps its own snapshots.
            text = text[:start] + replace_char * length + text[start + length:]
            after = text

            fields = [int(group) for group in match.groups()]
            try:
                if kind == 'datetime':
                    # Patterns without minute and/or second capture fewer
                    # groups; the missing fields default to 0.
                    fields += [0] * (6 - len(fields))
                    value = datetime.datetime(*fields)
                elif kind == 'time':
                    value = datetime.time(*fields)
                else:
                    value = datetime.date(*fields)
            except ValueError:
                # Digits matched the pattern but are not a real date/time
                # (e.g. month 13); the span stays masked and is skipped.
                continue

            result.append({'value': value, 'find_index': len(result), 'start': start, 'length': length,
                           'before': before, 'after': after})

    # Order by position in the text; the sort is stable, so equal keys keep
    # their discovery order.
    if sortBySite:
        result.sort(key=lambda found: found['start'])
    return result
class my_number:
    """Convert Chinese numeral characters embedded in text to ASCII digits.

    Two tables drive the conversion: ``chi_single`` maps each digit string
    ('0'..'9') to its Chinese characters, and ``chi_double`` maps the tens
    ('10', '20', '30', '40') to theirs.
    """

    def __init__(self, **kwargs):
        # Keep a private copy of the keyword options; no method of this class
        # reads them.
        self.kwargs = kwargs.copy()
        self.chi_single = {'0': ['〇', '零'], '1': ['一', '壹'], '2': ['二', '贰'], '3': ['三', '叁'],
                           '4': ['四', '肆'], '5': ['五', '伍'], '6': ['六', '陆'], '7': ['七', '柒'],
                           '8': ['八', '捌'], '9': ['九', '玖']}
        # Tens glyphs for 50 and above ('圩', '圆', '进', '枯', '枠') are
        # deliberately not enabled for now: they easily cause wrong results.
        self.chi_double = {'10': ['十', '拾'], '20': ['廿'], '30': ['卅'], '40': ['卌']}

    def chi_single_values(self):
        """Return a sorted list of every single-digit Chinese numeral character."""
        return sorted({char for chars in self.chi_single.values() for char in chars})

    def chi_double_values(self):
        """Return a sorted list of every tens character.

        Pronunciations: 廿 niàn (20), 卅 sà (30), 卌 xì (40).
        """
        return sorted({char for chars in self.chi_double.values() for char in chars})

    def convert_number_from_string(self, string: str):
        """Replace Chinese numeral characters in ``string`` with ASCII digits.

        Single-digit characters are replaced one-for-one. A tens character is
        resolved by looking at its direct neighbours:

        * no numeral neighbour -> the full value ('十' -> '10');
        * it starts the numeral group -> the tens digit ('十二' -> '12');
        * it ends the numeral group -> the units digit ('二十' -> '20');
        * it sits between two numerals -> dropped ('二十三' -> '23').

        Args:
            string: Text to convert.

        Returns:
            The converted text. A non-``str`` argument is returned unchanged.
        """
        if not isinstance(string, str):
            return string

        # Reverse lookup tables, built once per call. setdefault keeps the
        # first key that lists a character, matching table iteration order.
        single_map = {}
        for key, chars in self.chi_single.items():
            for char in chars:
                single_map.setdefault(char, key)
        tens_map = {}
        for key, chars in self.chi_double.items():
            for char in chars:
                tens_map.setdefault(char, key)
        numerals = set(single_map) | set(tens_map)

        last = len(string) - 1
        converted = []
        for i, curr in enumerate(string):
            if curr in single_map:
                converted.append(single_map[curr])
            elif curr in tens_map:
                key = tens_map[curr]
                # Build the numeral phrase around the tens character. The
                # bounds checks make a one-character input such as '十' safe.
                short = curr
                if i > 0 and string[i - 1] in numerals:
                    short = string[i - 1] + short
                if i < last and string[i + 1] in numerals:
                    short = short + string[i + 1]

                # Decide which digit(s) the tens character contributes.
                if len(short) == 1:
                    converted.append(key)
                elif curr == short[0]:
                    converted.append(key[0])
                elif curr == short[-1]:
                    converted.append(key[1])
                # Otherwise the neighbours already supply both digits.
            else:
                converted.append(curr)
        return ''.join(converted)