0. 说明
这是我自己写的一个时间标准化工具,功能是把一些常见的时间词描述转换为相对标准的YYYY-MM-DD格式。原理很简单,就是写规则匹配。
但是我刚写完这个代码之后,就找到了更好的项目,所以就把这个方法给淘汰了。
相比之下,这个功能不是很完备,也有很多亟待优化的地方,但是有这样一个方便的工具,总好过没有,所以把这个工具放出来给有需要的同学们使用。
1. 示例
稍后我们会实现一个名为 TimeNormalizer 的类;现在先假设它已经写好,将其实例化之后就可以直接使用:
# NOTE: relative expressions are resolved against a base date. The results
# shown below are consistent with a base date of 2022-08-03 (see the
# 'in the past thirty-five days' result).
tn = TimeNormalizer()
norm_time = tn.normalize('Sep. 14, 2009')
# 2009-09-14
norm_time = tn.normalize('in July, 1999')
# 1999-07-01__1999-07-31
norm_time = tn.normalize('in Friday')
# 2022-08-05
norm_time = tn.normalize('last Friday')
# 2022-07-29
norm_time = tn.normalize('early Friday')
# 2022-08-05:06__2022-08-05:12
norm_time = tn.normalize('in the autumn of 1997')
# 1997-09-01__1997-11-30
norm_time = tn.normalize('in the past thirty-five days')
# 2022-06-29__2022-08-03
norm_time = tn.normalize('yesterday night')
# 2022-08-02:20__2022-08-02:24
可以看到,其中很多时间描述词都是相对时间,所以我们需要一个基准时间。于是在设计这个类的时候,加入了一个参数 pub_time(因为我原本设计的是以新闻的发布时间作为基准,所以取名为 publish time)。这个参数需要是一个自定义的时间格式类型,下面介绍。
2. 时间格式
开始之前先import一下:
import re
from datetime import datetime, timedelta
在这里我定义了两种时间类型:时间点和时间片段。前者用来描述一个时刻,例如某一天;后者用来描述一段时间,例如一个月、一年等。
时间点的定义如下:
class StandardTimePoint:
    """
    Standard time point: a single calendar day, optionally with an hour.
    ---------------
    ver: 2021-12-13
    by: changhongyu
    """

    def __init__(self, std_y=None, std_m=None, std_d=None, std_h=None, pub_time=None, require_pub_time=True):
        """
        :param std_y: str: year, 4 characters, e.g. `2021`
        :param std_m: str: month, 2 characters, e.g. `01`
        :param std_d: str: day, 2 characters, e.g. `31` (mandatory)
        :param std_h: str: hour, 2 characters, e.g. `24`
        :param pub_time: StandardTimePoint: base time used to fill missing fields
        :param require_pub_time: bool: whether a base time must be attached;
            when True and `pub_time` is not given, today's date is used
        """
        assert std_d, Exception("If you don't specify the date, this time object is a time period.")
        # (flag, value, required length, message) for every optional field.
        field_rules = (
            ('y', std_y, 4, "Attribute `std_y` must have format like `2021`."),
            ('m', std_m, 2, "Attribute `std_m` must have format like `01`."),
            ('d', std_d, 2, "Attribute `std_d` must have format like `31`."),
            ('h', std_h, 2, "Attribute `std_h` must have format like `24`."),
        )
        explicit = ''
        for flag, value, width, message in field_rules:
            if not value:
                continue
            assert len(value) == width, ValueError(message)
            explicit += flag
        # Records which fields were given explicitly, e.g. 'ymd'.
        self.specific = explicit
        self.std_y, self.std_m, self.std_d, self.std_h = std_y, std_m, std_d, std_h
        if require_pub_time:
            if pub_time:
                self.pub_time = pub_time
            else:
                today = datetime.now()
                self.pub_time = StandardTimePoint(std_y=str(today.year),
                                                  std_m='%02d' % today.month,
                                                  std_d='%02d' % today.day,
                                                  require_pub_time=False)
        self._fill()

    def __repr__(self):
        # A day with a year but without a month cannot be rendered.
        if self.std_d and self.std_y and not self.std_m:
            raise Exception("Unspecific month.")
        pieces = [piece for piece in (self.std_y, self.std_m, self.std_d) if piece]
        if not pieces:
            raise Exception("Empty date.")
        text = '-'.join(pieces)
        if self.std_h:
            text += ':' + self.std_h
        return text

    def _fill(self):
        """Fill a missing year and/or month from the base time."""
        if not self.std_y:
            self.std_y = self.pub_time.std_y
        if not self.std_m:
            self.std_m = self.pub_time.std_m
时间片段的定义如下:
class StandardTimePeriod:
    """
    Standard time period: a span described by a start and an end
    (year, month, day and optionally hour on each side).
    ---------------
    ver: 2021-12-13
    by: changhongyu
    """

    def __init__(self, std_y_s=None, std_m_s=None, std_d_s=None, std_h_s=None,
                 std_y_e=None, std_m_e=None, std_d_e=None, std_h_e=None, pub_time=None, require_pub_time=True):
        """
        :param std_y_s: str: start year, 4 characters
        :param std_m_s: str: start month, 2 characters
        :param std_d_s: str: start day, 2 characters
        :param std_h_s: str: start hour, 2 characters
        :param std_y_e: str: end year, 4 characters
        :param std_m_e: str: end month, 2 characters
        :param std_d_e: str: end day, 2 characters
        :param std_h_e: str: end hour, 2 characters
        :param pub_time: StandardTimePoint: base time used to fill a missing year
        :param require_pub_time: bool: whether a base time must be attached;
            when True and `pub_time` is not given, today's date is used
        """
        self.specific = ''  # records which fields were given explicitly
        assert std_y_s or std_m_s or std_d_s, Exception("Empty date.")
        if std_y_s:
            assert std_y_e, Exception("If there is a start time, the end time is expected.")
            assert len(std_y_s) == len(std_y_e) == 4, \
                ValueError("Attribute `std_y` must have format like `2021`.")
            self.specific += 'y'
        if std_m_s:
            # Check the end value first so a missing one fails with a clear
            # assertion instead of a TypeError from len(None).
            assert std_m_e, Exception("If there is a start time, the end time is expected.")
            assert len(std_m_s) == len(std_m_e) == 2, \
                ValueError("Attribute `std_m` must have format like `01`.")
            self.specific += 'm'
        if std_d_s:
            assert std_d_e, Exception("If there is a start time, the end time is expected.")
            assert len(std_d_s) == len(std_d_e) == 2, \
                ValueError("Attribute `std_d` must have format like `31`.")
            self.specific += 'd'
        if std_h_s:
            assert std_h_e, Exception("If there is a start time, the end time is expected.")
            assert len(std_h_s) == len(std_h_e) == 2, \
                ValueError("Attribute `std_h` must have format like `24`.")
            self.specific += 'h'
        if require_pub_time:
            if pub_time:
                self.pub_time = pub_time
            else:
                today = datetime.now()
                self.pub_time = StandardTimePoint(std_y=str(today.year),
                                                  std_m='%02d' % today.month,
                                                  std_d='%02d' % today.day,
                                                  require_pub_time=False)
        # Days per month in a non-leap year, keyed by month number (1-12).
        self.month_days = {i + 1: num for i, num in enumerate([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])}
        self.std_y_s = std_y_s
        self.std_m_s = std_m_s
        self.std_d_s = std_d_s
        self.std_h_s = std_h_s
        self.std_y_e = std_y_e
        self.std_m_e = std_m_e
        self.std_d_e = std_d_e
        self.std_h_e = std_h_e
        self._fill()
        self.start_time = StandardTimePoint(self.std_y_s, self.std_m_s, self.std_d_s, self.std_h_s)
        self.end_time = StandardTimePoint(self.std_y_e, self.std_m_e, self.std_d_e, self.std_h_e)

    def _fill(self):
        """Fill missing fields so that the period always has year, month and day."""
        if not self.std_y_s:
            # Year missing: take it from the base time.
            self.std_y_s = self.pub_time.std_y
            self.std_y_e = self.pub_time.std_y
        if not self.std_m_s:
            # Month missing: cover the whole year.
            self.std_m_s = '01'
            self.std_m_e = '12'
        if not self.std_d_s:
            # Day missing: cover whole months.
            self.std_d_s = '01'
            self.std_d_e = str(self.month_days[int(self.std_m_e)])
            # Leap-year adjustment using the full Gregorian rule, so that
            # years divisible by 400 (e.g. 2000) also get February 29.
            year_e = int(self.std_y_e)
            if self.std_m_e == '02' and year_e % 4 == 0 and (year_e % 100 != 0 or year_e % 400 == 0):
                self.std_d_e = '29'

    def __repr__(self):
        start = '-'.join([self.std_y_s, self.std_m_s, self.std_d_s])
        if self.std_h_s:
            start += ':' + self.std_h_s
        end = '-'.join([self.std_y_e, self.std_m_e, self.std_d_e])
        if self.std_h_e:
            end += ':' + self.std_h_e
        return start + '__' + end
那么当我们希望传入一个基准时间时,则需要实例化一个时间点:
# All fields are passed as zero-padded strings: 4-character year, 2-character month and day.
std = StandardTimePoint(std_y='2022', std_m='01', std_d='01')
然后把它作为参数传给标准化工具进行实例化,就可以以指定的日期作为基准时间了,例如:
# `std` (2022-01-01) is the base date here, so 'yesterday' resolves to the day before it.
tn = TimeNormalizer(std)
tn.normalize('yesterday')
# 2021-12-31
3. 主类
下面是主类,直接上代码:
class TimeNormalizer:
    """
    [Main class of the time normalizer]
    Time descriptions are normalized from three angles:
    1. basic formatting
    2. holidays and special dates (TODO)
    3. descriptive inference
    Processing steps:
    1) match fully standard structures first,
    e.g. `20020101`, `2002-01-01`, `2002-01-01:05`
    2) then match descriptive inference,
    e.g. `last 3 years`
    3) then handle semi-standard decades,
    e.g. `in the 90s`
    4) semi-standard year/month/day (parts may be missing),
    e.g. `Nov. 1st, 1925`
    5) slices of a time unit,
    e.g. `in the autumn of 1923`
    6) finally handle simple structures,
    e.g. `in Summer`, `on Wednesday`, `yesterday`
    ---------------
    ver: 2021-12-16
    by: changhongyu
    """
def __init__(self, pub_time: StandardTimePoint = None):
"""
:param pub_time: StandardTimePoint: 新闻时间作为基准时间
"""
if not pub_time:
pub_time = datetime.now()
base_month = str(pub_time.month)
base_day = str(pub_time.day)
if len(base_month) == 1:
base_month = '0' + base_month
if len(base_day) == 1:
base_day = '0' + base_day
self.pub_time = StandardTimePoint(std_y=str(pub_time.year),
std_m=base_month,
std_d=base_day,
require_pub_time=False)
else:
self.pub_time = pub_time
# 0-100的英文转数字
num_str = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen',
'nineteen']
num_str_ty = ['twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
num_str_to_int = {n: i for i, n in enumerate(num_str)}
for i, ty in enumerate(num_str_ty):
num_str_to_int[ty] = (i + 2) * 10
for j in range(1, 10):
ns = ty + '-' + num_str[j]
num_str_to_int[ns] = (i + 2) * 10 + j
# 做个反转,防止个位数先于十位数被匹配到
key_list = [k for k in num_str_to_int.keys()][::-1]
self.num_str_to_int = {k: num_str_to_int[k] for k in key_list}
# 1. 基本格式化, 需注意所有表述均为title形式
# 1) 月份相关
self.month_long_string = ["January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December"]
self.month_short_string = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
self.num_days_of_every_month = [str(n) for n in [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]]
# 2) 星期相关
self.weekday_long_string = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
self.weekday_short_string = ['Mon', 'TUE', 'WED', 'Thu', 'Fri', 'Sat', 'Sun']
# 3) 分割描述
self.y_sp = {'Early': ['01', '04'], 'Mid': ['05', '08'], 'Late': ['09', '12'],
'Spring': ['03', '05'], 'Summer': ['06', '08'], 'Autumn': ['09', '11'], 'Winter': ['12', '02'],
'01+Quarter': ['01', '03'], '02+Quarter': ['04', '06'],
'03+Quarter': ['07', '09'], '04+Quarter': ['10', '12'], 'Last+Quarter': ['10', '12']}
self.m_sp = {'Early': ['01', '10'], 'Mid': ['11', '20'], 'Late': ['21', '31']}
self.d_sp = {'Morning': ['06', '10'], 'Noon': ['11', '14'], 'Afternoon': ['15', '18'], 'Evening': ['19', '22'],
'Dawn': ['05', '07'], 'Day': ['08', '16'], 'Dusk': ['17', '19'], 'Night': ['20', '24'],
'Mid+Night': ['01', '03'], 'Mid+Day': ['11', '13'],
'Early': ['06', '12'], 'Late': ['18', '24']}
# 2. 节日计算
# 3. 描述性推断
# 1) 介词推断
self.preps = [s.title for s in ['on', 'in', 'at', 'before', 'after', 'during', 'since', 'toward']]
# 2) 描述词
self.present_words = ['This', 'Current', 'Present']
self.previous_words = ['Past', 'Last', 'Previous']
self.future_words = ['Next', 'Following', 'Coming', 'Up-Coming', 'Upcoming']
self.delta_dict = {
"Day": {'y': 0, 'm': 0, 'd': 1},
"Week": {'y': 0, 'm': 0, 'd': 7},
"Fortnight": {'y': 0, 'm': 0, 'd': 14},
"Month": {'y': 0, 'm': 1, 'd': 0},
"Season": {'y': 0, 'm': 3, 'd': 0},
"Quarter": {'y': 0, 'm': 3, 'd': 0},
"Year": {'y': 1, 'm': 0, 'd': 0},
"Decade": {'y': 10, 'm': 0, 'd': 0},
"Century": {'y': 100, 'm': 0, 'd': 0},
"Centuries": {'y': 100, 'm': 0, 'd': 0},
}
def _init_pub(self):
    """
    Reset the base time to today's date (meant to be called after each extraction).
    """
    today = datetime.now()
    year_text = str(today.year)
    month_text = '%02d' % today.month
    day_text = '%02d' % today.day
    self.pub_time = StandardTimePoint(std_y=year_text,
                                      std_m=month_text,
                                      std_d=day_text,
                                      require_pub_time=False)
def normalize(self, str_time, pub_time=None):
"""
将一个时间字符串标准化
同类规则按照从复杂到简单的顺序对内容进行结构化
当前边复杂的规则匹配到,则直接return结果
否则会继续判断是否符合简单的规则
:param str_time: str: 抽取出来的时间
:param pub_time: StandardTimePoint
:return:
TODO: 顺序调整
"""
if pub_time:
self.pub_time = pub_time
# <0 完全标准结构, `20020101`, `2002-01-01`, `2002-01-01:05`
normed_time = self._normalize_rule_0(str_time)
if normed_time:
self._init_pub()
return normed_time
# <4 `last 3 years`
normed_time = self._normalize_rule_4(str_time)
if normed_time:
self._init_pub()
return normed_time
# <1 抽取十年单位, `in the 90s`
normed_time = self._normalize_rule_1(str_time)
if normed_time:
self._init_pub()
return normed_time
# <2 抽取相对标准结构, `Nov. 1st, 1925`
normed_time = self._normalize_rule_2(str_time)
if normed_time:
self._init_pub()
return normed_time
# <3 抽取分割时间描述, `in the autumn of 1923`
normed_time = self._normalize_rule_3(str_time)
if normed_time:
self._init_pub()
return normed_time
# <5 单个的周几或者季节描述, 'in Summer'
normed_time = self._normalize_rule_5(str_time)
if normed_time:
self._init_pub()
return normed_time
self._init_pub()
return None
@staticmethod
def _normalize_rule_0(str_time):
"""
完全标准的结构进行转换
例如:`20020101`, `2002-01-01`, `2002-01-01:05`
:param str_time: str: 抽取出来的时间
:return:
"""
str_time = str_time.replace('-', '').replace(':', '')
if not str_time.isdigit():
# 只允许存在`:`和`-`两种非数字字符
return
if not len(str_time) == 8 or len(str_time) == 10:
return
_year = str_time[0: 4]
_month = str_time[4: 6]
_day = str_time[6: 8]
_hour = str_time[8: 10] if len(str_time) == 10 else None
return StandardTimePoint(std_y=_year, std_m=_month, std_d=_day, std_h=_hour)
def _normalize_rule_1(self, str_time):
"""
抽取十年单位
例如:in the 90s, in the early 1920s
:param str_time: str: 抽取出来的时间
:return:
"""
rule_1 = u"the [0-9]0s"
pattern_1 = re.compile(rule_1)
match = pattern_1.findall(str_time)
if len(match):
# 匹配到'the 90s'
match_str = match[0]
year_ty = match_str[-3]
if year_ty > self.pub_time.std_y[-2]:
# 大于新闻时间所在年份,则认定为上个世纪,否则默认本世纪
year_thou = '19'
else:
year_thou = '20'
if 'early' in str_time:
std_y_s = year_thou + year_ty + '0'
std_y_e = year_thou + year_ty + '4'
elif 'late' in str_time:
std_y_s = year_thou + year_ty + '5'
std_y_e = year_thou + year_ty + '9'
else:
std_y_s = year_thou + year_ty + '0'
std_y_e = year_thou + year_ty + '9'
return StandardTimePeriod(std_y_s=std_y_s, std_y_e=std_y_e)
else:
rule_2 = u"the [0-9]{3}0s"
pattern_2 = re.compile(rule_2)
match = pattern_2.findall(str_time)
if len(match):
# 匹配到'the 1900s'
match_str = match[0]
std_y_s = match_str[4: 8]
std_y_e = match_str[4: 7] + '9'
return StandardTimePeriod(std_y_s=std_y_s, std_y_e=std_y_e)
else:
rule_3 = u"in early [1-2][0-9]{2}0s"
pattern_3 = re.compile(rule_3)
match = pattern_3.findall(str_time)
if len(match):
match_str = match[0]
std_y_s = match_str[-5: -1]
std_y_e = match_str[-5: -2] + '5'
return StandardTimePeriod(std_y_s=std_y_s, std_y_e=std_y_e)
rule_4 = u"in late [1-2][0-9]{2}0s"
pattern_4 = re.compile(rule_4)
match = pattern_4.findall(str_time)
if len(match):
match_str = match[0]
std_y_s = match_str[-5: -2] + '6'
std_y_e = match_str[-5: -2] + '9'
return StandardTimePeriod(std_y_s=std_y_s, std_y_e=std_y_e)
rule_5 = u"in [1-2][0-9]{2}0s"
pattern_5 = re.compile(rule_5)
match = pattern_5.findall(str_time)
if len(match):
match_str = match[0]
std_y_s = match_str[-5: -1]
std_y_e = match_str[-5: -2] + '9'
return StandardTimePeriod(std_y_s=std_y_s, std_y_e=std_y_e)
return
def _normalize_rule_2(self, str_time):
"""
抽取相对标准日期结构
例如,`Nov. 1st, 1925`
:param str_time: str: 抽取出来的时间
:return:
"""
str_time = self.pre_format(str_time)
_, _year, _month, _day, _hour, _split, __ = self._get_time_parts(str_time)
if _split:
# 如果有分割信息,则交给rule 3处理
return
if not _month:
return
elif _day:
# 如果明确了日期,返回时间点
if len(_day) == 1:
_day = '0' + _day
if len(_month) == 1:
_month = '0' + _month
return StandardTimePoint(std_y=_year, std_m=_month, std_d=_day, std_h=_hour)
else:
# 如果没有明确日期,返回时间片段
return StandardTimePeriod(std_y_s=_year, std_y_e=_year, std_m_s=_month, std_m_e=_month)
def _normalize_rule_3(self, str_time):
"""
抽取模糊表述
例如:`in the autumn of 1923`, `in early April`, `at morning of the 7th.`
:param str_time:
:return:
"""
str_time = self.pre_format(str_time)
_prep, _year, _month, _day, _hour, _split, _weekday = self._get_time_parts(str_time)
if _hour:
# 如果最小单位是小时,直接返回时间点
return StandardTimePoint(_year, _month, _day, _hour, self.pub_time)
if _weekday:
# 如果最小单位是星期几,则先找对应的星期,再对当日进行划分
if _split and _split in self.d_sp:
_hour_s, _hour_e = self.d_sp[_split]
_weekday_num = None
for i, (wd_l, wd_s) in enumerate(zip(self.weekday_long_string, self.weekday_short_string)):
if _weekday.lower() in [wd_l.lower(), wd_s.lower()]:
_weekday_num = i + 1 if i != 6 else 0 # 0对应周天,1对应周一
if not _weekday_num:
raise ValueError('{} not found.'.format(_weekday))
week_day_point = self._get_week_day(_weekday_num)
week_day = StandardTimePeriod(std_y_s=week_day_point.std_y, std_m_s=week_day_point.std_m,
std_d_s=week_day_point.std_d, std_h_s=_hour_s,
std_y_e=week_day_point.std_y, std_m_e=week_day_point.std_m,
std_d_e=week_day_point.std_d, std_h_e=_hour_e)
return week_day
elif _day:
# 如果最小单位是天,根据是否有分割描述,返回相应的时间片段或时间点
if _split and _split in self.d_sp:
# 如果有分割,则按照时间片段处理
_hour_s, _hour_e = self.d_sp[_split]
if not _month:
_month = self.pub_time.std_m
return StandardTimePeriod(std_y_s=_year, std_m_s=_month, std_d_s=_day, std_h_s=_hour_s,
std_y_e=_year, std_m_e=_month, std_d_e=_day, std_h_e=_hour_e,
pub_time=self.pub_time)
else:
# 如果没有分割描述,精确到天的时间按照时间点处理
return StandardTimePoint(_year, _month, _day, None, self.pub_time)
if _month:
# 如果最小单位是月,则返回时间片段
if _split and _split in self.m_sp:
_day_s, _day_e = self.m_sp[_split]
return StandardTimePeriod(std_y_s=_year, std_m_s=_month, std_d_s=_day_s,
std_y_e=_year, std_m_e=_month, std_d_e=_day_e,
pub_time=self.pub_time)
else:
return StandardTimePeriod(std_y_s=_year, std_m_s=_month, std_d_s='01',
std_y_e=_year, std_m_e=_month, std_d_e=self.num_days_of_every_month[_month],
pub_time=self.pub_time)
if _year:
# 如果最小单位是年,则返回时间片段
if _split and _split in self.y_sp:
_month_s, _month_e = self.y_sp[_split]
return StandardTimePeriod(std_y_s=_year, std_m_s=_month_s,
std_y_e=_year, std_m_e=_month_e,
pub_time=self.pub_time)
else:
return StandardTimePeriod(std_y_s=_year, std_y_e=_year, pub_time=self.pub_time)
if not _year and _split in self.y_sp:
# 如果只说了是这年的什么时间,而没有明确年,则默认基准时间年
_month_s, _month_e = self.y_sp[_split]
return StandardTimePeriod(std_y_s=self.pub_time.std_y, std_m_s=_month_s,
std_y_e=self.pub_time.std_y, std_m_e=_month_e,
pub_time=self.pub_time)
return
def _normalize_rule_4(self, str_time):
"""
描述性推断:描述词 + 时间
例如:`this Wednesday`, `last month`, `past 2 weeks`
:param str_time:
:return:
TODO: two days ago
"""
str_time = self.__replace_str_num(str_time)
for word in self.present_words + self.previous_words + self.future_words:
# 第一遍循环, 匹配"last 2 days"结构
if word in str_time.title():
# 先尝试找两位数的
rule = u"{} [0-9][0-9]".format(word)
pattern = re.compile(rule)
matched = pattern.findall(str_time.title())
if len(matched):
matched = matched[0]
else:
# 如果没有找到两位数的,再找一位数的
rule_2 = u"{} [0-9]".format(word)
pattern = re.compile(rule_2)
matched = pattern.findall(str_time.title())
if len(matched):
matched = matched[0]
else:
continue
tense_word, num_word = matched.split()
num_word = int(num_word)
assert tense_word == word
if tense_word in self.present_words:
continue
elif tense_word in self.previous_words:
tense = -1
else:
tense = 1
for time_span in self.delta_dict:
# 判断时间词是年还是月还是日
if time_span in str_time.title():
if self.delta_dict[time_span]['y']:
# 如果以年为单位计
_delta_years = num_word * self.delta_dict[time_span]['y']
if tense == -1:
# 如果是过去到现在, 例如`in past two years`
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y)-_delta_years),
std_m_s=self.pub_time.std_m, std_d_s=self.pub_time.std_d,
std_y_e=self.pub_time.std_y, std_m_e=self.pub_time.std_m,
std_d_e=self.pub_time.std_d)
else:
# 如果是现在到将来,例如`in next two years`
return StandardTimePeriod(std_y_s=self.pub_time.std_y, std_m_s=self.pub_time.std_m,
std_d_s=self.pub_time.std_d,
std_y_e=str(int(self.pub_time.std_y)+_delta_years),
std_m_e=self.pub_time.std_m, std_d_e=self.pub_time.std_d)
elif self.delta_dict[time_span]['m']:
# 如果以月为单位计
_delta_months = num_word * self.delta_dict[time_span]['m']
_delta_years = 0
while _delta_months >= 12:
# 超过一年的换算成年
_delta_years += 1
_delta_months -= 12
if tense == -1:
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y)-_delta_years),
std_m_s=str(int(self.pub_time.std_m)-_delta_months),
std_d_s=self.pub_time.std_d, std_y_e=self.pub_time.std_y,
std_m_e=self.pub_time.std_m, std_d_e=self.pub_time.std_d)
elif tense == 1:
return StandardTimePeriod(std_y_s=self.pub_time.std_y, std_m_s=self.pub_time.std_m,
std_d_s=self.pub_time.std_d,
std_y_e=str(int(self.pub_time.std_y)+_delta_years),
std_m_e=str(int(self.pub_time.std_m)+_delta_months),
std_d_e=self.pub_time.std_d)
else:
# 如果是以日为单位计
# 这里其实可以改成_get_time_delta_days, 但是我懒得改了
_delta_days = num_word * self.delta_dict[time_span]['d'] * tense
_delta_days = timedelta(days=_delta_days)
datetime_ = datetime(year=int(self.pub_time.std_y), month=int(self.pub_time.std_m),
day=int(self.pub_time.std_d))
target_datetime_ = datetime_ + _delta_days
# 补零
_m = str(datetime_.month) if datetime_.month > 9 else '0' + str(datetime_.month)
_tar_m = str(target_datetime_.month) if target_datetime_.month > 9 \
else '0' + str(target_datetime_.month)
_d = str(datetime_.day) if datetime_.day > 9 else '0' + str(datetime_.day)
_tar_d = str(target_datetime_.day) if target_datetime_.day > 9 \
else '0' + str(target_datetime_.day)
if tense == -1:
return StandardTimePeriod(std_y_s=str(target_datetime_.year),
std_m_s=_tar_m,
std_d_s=_tar_d,
std_y_e=str(datetime_.year),
std_m_e=_m,
std_d_e=_d)
else:
return StandardTimePeriod(std_y_s=str(datetime_.year),
std_m_s=_m,
std_d_s=_d,
std_y_e=str(target_datetime_.year),
std_m_e=_tar_m,
std_d_e=_tar_d)
for word in self.present_words + self.previous_words + self.future_words:
# 第二遍循环, 匹配"last year"结构
if word in str_time.title():
if word in self.present_words:
tense = 0
elif word in self.previous_words:
tense = -1
else:
tense = 1
for time_span in self.delta_dict:
if time_span in str_time.title():
if self.delta_dict[time_span]['y']:
# 如果以年为单位计
if tense == -1:
# 如果是过去到现在, 例如`last year`
if time_span.lower() == 'year':
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y)-1),
std_y_e=str(int(self.pub_time.std_y)-1))
elif time_span.lower() == 'decade':
return self._get_decade_span(str(int(self.pub_time.std_y)-1))
elif time_span.lower() == 'century':
return self._get_century_span(str(int(self.pub_time.std_y)-1))
else:
pass
elif tense == 0:
# 如果是当前m, 例如`this year`
if time_span.lower() == 'year':
return StandardTimePeriod(std_y_s=self.pub_time.std_y,
std_y_e=self.pub_time.std_y)
elif time_span.lower() == 'decade':
return self._get_decade_span(self.pub_time.std_y)
elif time_span.lower() == 'century':
return self._get_century_span(self.pub_time.std_y)
else:
pass
else:
# 如果是现在到将来,例如`next decade`
if time_span.lower() == 'year':
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y)+1),
std_y_e=str(int(self.pub_time.std_y+1)))
elif time_span.lower() == 'decade':
return self._get_decade_span(str(int(self.pub_time.std_y)+1))
elif time_span.lower() == 'century':
return self._get_century_span(str(int(self.pub_time.std_y)+1))
else:
pass
elif self.delta_dict[time_span]['m']:
# 如果以月为单位计
if tense == -1:
if time_span.lower() == 'month':
# 考虑推算到去年的情况
if int(self.pub_time.std_m)-1 <= 0:
_year = str(int(self.pub_time.std_y)-1)
_month = str(12 + int(self.pub_time.std_m)-1)
if len(_month) == 1:
_month = '0' + _month
else:
_year = self.pub_time.std_y
_month = str(int(self.pub_time.std_m)-1)
if len(_month) == 1:
_month = '0' + _month
return StandardTimePeriod(std_y_s=_year, std_m_s=_month,
std_y_e=_year, std_m_e=_month)
elif time_span.lower() == 'quarter':
return self._get_quarter_span(year=self.pub_time.std_y,
month=self.pub_time.std_m)
else:
pass
elif tense == 0:
if time_span.lower() == 'month':
return StandardTimePeriod(std_m_s=self.pub_time.std_m,
std_m_e=self.pub_time.std_m)
elif time_span.lower() == 'quarter':
return self._get_quarter_span(year=self.pub_time.std_y,
month=self.pub_time.std_m)
else:
pass
if tense == 1:
if time_span.lower() == 'month':
# 考虑推算到明年的情况
if int(self.pub_time.std_m) + 1 >= 0:
_year = str(int(self.pub_time.std_y) + 1)
_month = str(int(self.pub_time.std_m) + 1 - 12)
else:
_year = self.pub_time.std_y
_month = str(int(self.pub_time.std_m) + 1)
return StandardTimePeriod(std_y_s=_year, std_m_s=_month,
std_y_e=_year, std_m_e=_month)
elif time_span.lower() == 'quarter':
return self._get_quarter_span(year=self.pub_time.std_y,
month=self.pub_time.std_m)
else:
# 如果是以日为单位计
if tense == -1:
if time_span.lower() == 'week':
last_week_time_point = self._get_time_delta_days(7, 'backward')
return self._get_week_span(year=last_week_time_point.std_y,
month=last_week_time_point.std_m,
day=last_week_time_point.std_d)
elif time_span.lower() == 'fortnight':
# 如果是过去的两周则直接向前推算14天
return self._get_time_delta_days(14, direction='backward')
elif time_span.lower() == 'day':
# last day 这种说法似乎不太合理
return self._get_time_delta_days(1, direction='backward')
else:
pass
if tense == 0:
if time_span.lower() == 'week':
return self._get_week_span(year=self.pub_time.std_y,
month=self.pub_time.std_m,
day=self.pub_time.std_d)
elif time_span.lower() == 'fortnight':
# 没有这种说法
pass
elif time_span.lower() == 'day':
return StandardTimePoint(self.pub_time.std_y,
self.pub_time.std_m,
self.pub_time.std_d)
else:
pass
if tense == 1:
if time_span.lower() == 'week':
next_week_time_point = self._get_time_delta_days(7, 'forward')
return self._get_week_span(year=next_week_time_point.std_y,
month=next_week_time_point.std_m,
day=next_week_time_point.std_d)
elif time_span.lower() == 'fortnight':
return self._get_time_delta_days(14, direction='forward')
elif time_span.lower() == 'day':
return self._get_time_delta_days(1, direction='forward')
else:
pass
for week_day in self.weekday_long_string:
# `this Sunday`
if week_day in str_time.title():
if tense == -1:
# `last Monday`
last_week_time_point = self._get_time_delta_days(7, 'backward')
return self._get_week_day(self._weekday_to_num(week_day), last_week_time_point)
elif tense == 0:
# `this Sunday`
return self._get_week_day(self._weekday_to_num(week_day))
else:
next_week_time_point = self._get_time_delta_days(7, 'forward')
return self._get_week_day(self._weekday_to_num(week_day), next_week_time_point)
for season in ['Spring', 'Summer', 'Autumn', 'Winter']:
# `this Summer`
if season in str_time.title():
if season != 'Winter':
if tense == -1:
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y)-1),
std_m_s=self.y_sp[season][0],
std_y_e=str(int(self.pub_time.std_y)-1),
std_m_e=self.y_sp[season][1])
elif tense == 0:
return StandardTimePeriod(std_m_s=self.y_sp[season][0],
std_m_e=self.y_sp[season][1])
else:
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y)+1),
std_m_s=self.y_sp[season][0],
std_y_e=str(int(self.pub_time.std_y)+1),
std_m_e=self.y_sp[season][1])
else:
if tense == -1:
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y)-1),
std_m_s=self.y_sp[season][0],
std_y_e=self.pub_time.std_y,
std_m_e=self.y_sp[season][1])
elif tense == 0:
# 如果当前时间是冬天,则this winter指的是去年底到今年初
# 如果当前时间不是冬天,则this winter指的是今年底到明年初
if 3 <= int(self.pub_time.std_m) < 12:
return StandardTimePeriod(std_y_s=self.pub_time.std_y,
std_m_s=self.y_sp[season][0],
std_y_e=str(int(self.pub_time.std_y)+1),
std_m_e=self.y_sp[season][1])
else:
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y)-1),
std_m_s=self.y_sp[season][0],
std_y_e=self.pub_time.std_y,
std_m_e=self.y_sp[season][1])
else:
return StandardTimePeriod(std_y_s=self.pub_time.std_y,
std_m_s=self.y_sp[season][0],
std_y_e=str(int(self.pub_time.std_y)+1),
std_m_e=self.y_sp[season][1])
return
def _normalize_rule_5(self, str_time):
"""
星期、季节推断
rule4中包含了一部分星期,这里的星期是单个的星期,不存在描述词,所以在4之后
`in Summer`, `on Tuesday`
:param str_time:
:return:
"""
for week_day in self.weekday_short_string + self.weekday_long_string:
if week_day in str_time.title():
if week_day in self.weekday_short_string:
w_word = self._weekday_short_to_long(week_day)
else:
w_word = week_day
return self._get_week_day(self._weekday_to_num(w_word))
for season in ['Spring', 'Summer', 'Autumn', 'Winter']:
if season in str_time.title():
if season != 'Winter':
return StandardTimePeriod(std_m_s=self.y_sp[season][0],
std_m_e=self.y_sp[season][1])
else:
if 3 <= int(self.pub_time.std_m) < 12:
return StandardTimePeriod(std_y_s=self.pub_time.std_y,
std_m_s=self.y_sp[season][0],
std_y_e=str(int(self.pub_time.std_y) + 1),
std_m_e=self.y_sp[season][1])
else:
return StandardTimePeriod(std_y_s=str(int(self.pub_time.std_y) - 1),
std_m_s=self.y_sp[season][0],
std_y_e=self.pub_time.std_y,
std_m_e=self.y_sp[season][1])
str_time = self.pre_format(str_time)
for specific_day in ['Today', 'Yesterday', 'Tomorrow']:
if specific_day in str_time.title():
if specific_day == 'Today':
_tar_day = self.pub_time
elif specific_day == 'Yesterday':
_tar_day = self._get_time_delta_days(1, direction='backward')
else:
_tar_day = self._get_time_delta_days(1, direction='forward')
for dsp in self.d_sp:
# 一天之内的分割词,例如morning
# 如果匹配到,按照时间片段算,否则按照时间点算作某一天
if dsp in str_time.title():
return StandardTimePeriod(std_y_s=_tar_day.std_y, std_m_s=_tar_day.std_m,
std_d_s=_tar_day.std_d, std_h_s=self.d_sp[dsp][0],
std_y_e=_tar_day.std_y, std_m_e=_tar_day.std_m,
std_d_e=_tar_day.std_d, std_h_e=self.d_sp[dsp][1])
return StandardTimePoint(std_y=_tar_day.std_y,
std_m=_tar_day.std_m,
std_d=_tar_day.std_d)
if 'Tonight' in str_time.title():
return StandardTimePeriod(std_y_s=self.pub_time.std_y, std_m_s=self.pub_time.std_m,
std_d_s=self.pub_time.std_d, std_h_s='18',
std_y_e=self.pub_time.std_y, std_m_e=self.pub_time.std_m,
std_d_e=self.pub_time.std_d, std_h_e='24')
return
@staticmethod
def pre_format(str_time):
"""
预处理格式化
"""
while ' ' in str_time:
str_time = str_time.replace(' ', ' ')
str_time = str_time.title()
str_time = str_time.replace('1St', '01').replace('2Nd', '02').replace('3Rd', '03').replace('4Th', '04')\
.replace('5Th', '05').replace('6Th', '06').replace('7Th', '07').replace('8Th', '08').replace('9Th', '09')
str_time = str_time.replace(' Quarter', '+Quarter').replace('Mid ', 'Mid+') # 用加号防止split将结构切开
str_time = str_time.replace('St ', ' ').replace('Nd ', ' ').replace('Rd ', ' ').replace('Th ', ' ')
str_time = str_time.replace('First', '01').replace('Second', '02').replace('Third', '03')\
.replace('Fourth', '04').replace('Fifth', '05').replace('Sixth', '06').replace('Seventh', '07')\
.replace('Eighth', '08').replace('Ninth', '09').replace('Tenth', '10')
return str_time
def _get_time_parts(self, str_time):
"""
对时间切片得到各个成分
"""
_prep, _year, _month, _day, _hour, _split, _weekday = None, None, None, None, None, None, None
time_parts = str_time.split()
for part in time_parts:
if part in self.preps:
_prep = part
elif part.title().replace('.', '').replace(',', '') in self.month_long_string + self.month_short_string:
_month = part.title().replace('.', '').replace(',', '')
if _month in self.month_short_string:
_month = self._month_short_to_long(_month)
_month = self._month_to_num(_month)
elif part.replace(',', '') in ['0' + str(i) if i < 10 else str(i) for i in range(1, 32)] + \
[str(i) for i in range(1, 32)]:
_day = part.replace(',', '')
if len(_day) == 1:
_day = '0' + _day
elif '1000' < part < '2100':
_year = part
elif part in self.y_sp or part in self.m_sp or part in self.d_sp:
_split = part
elif part.title().replace('.', '').replace(',', '') in self.weekday_long_string + self.weekday_short_string:
_weekday = part.title().replace('.', '').replace(',', '')
if _day:
assert not _weekday, Exception('If there have been a specific day, a weekday is unexpected.')
return _prep, _year, _month, _day, _hour, _split, _weekday
def _month_short_to_long(self, short_month):
short_month = short_month.title()
month_short_to_long_dict = {k: v for k, v in zip(self.month_short_string, self.month_long_string)}
return month_short_to_long_dict[short_month]
def _month_to_num(self, month):
month_to_num_dict = {m: str(i + 1) if i >= 9 else '0' + str(i + 1) for i, m in
enumerate(self.month_long_string)}
return month_to_num_dict[month.title()]
def _weekday_short_to_long(self, short_weekday):
short_weekday = short_weekday.title()
weekday_short_to_long_dict = {k: v for k, v in zip(self.weekday_short_string, self.weekday_long_string)}
return weekday_short_to_long_dict[short_weekday]
def _weekday_to_num(self, weekday):
for i, w in enumerate(self.weekday_long_string):
if weekday == w:
if weekday == "Sunday":
return 0
else:
return i + 1
raise ValueError("{} is not a standard weekday.".format(weekday))
@staticmethod
def _get_decade_span(year):
    """
    Return the decade that contains the given year.
    Example: (2021) --> [(2020, 01, 01), (2029, 12, 31)]

    :param year: str
    :return: StandardTimePeriod
    """
    decade_prefix = year[:3]
    return StandardTimePeriod(std_y_s='{}0'.format(decade_prefix),
                              std_y_e='{}9'.format(decade_prefix))
@staticmethod
def _get_century_span(year):
    """
    Return the span from year xx00 to year xx99 that contains the given year.

    :param year: str
    :return: StandardTimePeriod
    """
    century_prefix = year[:2]
    return StandardTimePeriod(std_y_s='{}00'.format(century_prefix),
                              std_y_e='{}99'.format(century_prefix))
@staticmethod
def _get_quarter_span(year, month):
    """
    Return the calendar quarter that contains the given month.

    :param year: str
    :param month: str
    :return: StandardTimePeriod
    """
    month_number = int(month)
    if not 1 <= month_number <= 12:
        raise Exception()
    # First month of the quarter: 1, 4, 7 or 10.
    first_month = 3 * ((month_number - 1) // 3) + 1
    return StandardTimePeriod(std_y_s=year, std_m_s='%02d' % first_month,
                              std_y_e=year, std_m_e='%02d' % (first_month + 2))
@staticmethod
def _get_week_span(year, month, day):
    """
    Return the Sunday-to-Saturday week that contains the given date.

    The end year is taken from the week's last day, so a week that crosses
    New Year ends in the following year (previously the start year was used
    for both ends).

    :param year: str
    :param month: str
    :param day: str
    :return: StandardTimePeriod
    """
    this_day = datetime(int(year), int(month), int(day))
    # datetime.weekday(): Monday = 0 ... Sunday = 6; weeks here start on Sunday.
    days_since_sunday = (this_day.weekday() + 1) % 7
    week_start = this_day - timedelta(days=days_since_sunday)
    week_end = week_start + timedelta(days=6)
    return StandardTimePeriod(std_y_s='%04d' % week_start.year, std_m_s='%02d' % week_start.month,
                              std_d_s='%02d' % week_start.day,
                              std_y_e='%04d' % week_end.year, std_m_e='%02d' % week_end.month,
                              std_d_e='%02d' % week_end.day)
def _get_time_delta_days(self, days, direction='backward', base_time=None):
    """
    Compute the date a number of days before or after a base date.

    :param days: int: number of days to move
    :param direction: str: `backward` or `forward`
    :param base_time: object with std_y / std_m / std_d; defaults to self.pub_time
    :return: StandardTimePoint
    """
    assert direction in ['backward', 'forward'], Exception("Direction must be `forward` or `backward`.")
    anchor = base_time if base_time else self.pub_time
    signed_days = -days if direction == 'backward' else days
    moved = datetime(int(anchor.std_y), int(anchor.std_m), int(anchor.std_d)) + timedelta(days=signed_days)
    return StandardTimePoint(std_y=str(moved.year),
                             std_m='%02d' % moved.month,
                             std_d='%02d' % moved.day)
def _get_week_day(self, weekday, base_week_time=None):
    """
    Return the given weekday within the week of the base date.

    :param weekday: int: 0: Sunday; 6: Saturday
    :param base_week_time: object with std_y / std_m / std_d; defaults to self.pub_time
    :return: StandardTimePoint
    """
    if weekday >= 7:
        weekday %= 7
    anchor = base_week_time if base_week_time else self.pub_time
    week = self._get_week_span(anchor.std_y, anchor.std_m, anchor.std_d)
    week_start = datetime(int(week.std_y_s), int(week.std_m_s), int(week.std_d_s))
    target = week_start + timedelta(days=weekday)
    return StandardTimePoint(std_y=str(target.year),
                             std_m='%02d' % target.month,
                             std_d='%02d' % target.day)
@staticmethod
def __num_to_int(num):
"""
0-99的数字转换,英文描述或字符串,转整型
:param num: str
:return:
"""
nums = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen']
num_to_int = {n: i for i, n in enumerate(nums)}
nums_ty = ['twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
num_to_int_ty = {n: (i + 2) * 10 for i, n in enumerate(nums_ty)}
try:
return int(num)
except:
try:
if '-' in num:
num_1, num_2 = num.split('-') # 如果对不上,则跳转finally
return num_to_int_ty[num_1.lower().replace(' ', '')] + num_to_int[num_2.lower().replace(' ', '')]
else:
return num_to_int[num.lower().replace(' ', '')]
except:
return 0
def __replace_str_num(self, text):
"""
如果字符串中英文单词描述的数字,则将其替换成数字
例如: 'seven days' --> '7 days'
"""
for k in self.num_str_to_int:
text = text.replace(k, str(self.num_str_to_int[k]))
return text
总之,我这个时间标准化工具写的不怎么完备,思想也非常朴素,但是它的的确确可以解决一部分时间标准化的问题(如果你有更好用的标准化工具那当我没说)。
如果这个工具对你有所帮助,记得点赞支持哦。