在实际的爬虫项目中,往往需要爬取上万个网站,如何在抓取数据的过程中统一这些网站的数据格式是一个很大的问题。这些网站的时间格式千奇百怪,各种语言都有,所以为了兼容大多数网站,写了这个日期格式转换的脚本。
它并不能百分之百解决所有网站的时间格式,但足以应对大多数网站。
不足之处,请根据项目的需要自行修改。
全部代码
import datetime
import re
import time
import logging
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
from datetime import timezone
# Relative-day keywords mapped to the number of days to subtract from today.
# Insertion order is kept as-is because getDateTime scans the keys in order.
date_dict = {
    keyword: days_back
    for days_back, keywords in enumerate((
        ('刚刚', '剛剛', '今天', '今日'),
        ('昨天', '昨日'),
        ('前天', '前日'),
    ))
    for keyword in keywords
}

# Chinese date/time markers and the separator each one is replaced with
# before the string is handed to the date parser.
replace_dict = {
    # date part
    '年': '-',
    '月': '-',
    '日': ' ',
    # time part
    '时': ':',
    '時': ':',
    '点': ':',
    '點': ':',
    '分': ':',
    '秒': ' ',
    # dotted dates such as 2020.09.03
    '.': '-',
    # AM / PM markers are dropped
    '上午': ' ',
    '下午': ' ',
}
# Length of a single unit for "<N><unit> ago" expressions, keyed by unit word.
# Years and months use relativedelta; the other units use timedelta.
date_before_dict = {
    unit: step
    for units, step in (
        (('年',), relativedelta(years=1)),
        (('月', '个月'), relativedelta(months=1)),
        (('周', '星期'), datetime.timedelta(days=7)),
        (('天', '日'), datetime.timedelta(days=1)),
        (('时', '小时', '時', '小時'), datetime.timedelta(hours=1)),
        (('分', '分钟', '分鍾'), datetime.timedelta(minutes=1)),
        (('秒', '秒钟', '秒鍾'), datetime.timedelta(seconds=1)),
    )
    for unit in units
}
def dealstring(date_time):
    """Normalize a raw scraped date string before it is parsed.

    The string is stripped and upper-cased, then cleaned in this order:

    1. English relative times ("3 hours ago") are rewritten to the Chinese
       "<N><unit>前" form with all spaces removed.
    2. Everything up to and including an "ATNAUJINTA" label is dropped.
    3. Everything up to and including a "PUBLISHED:" / "UPDATED:" label is
       dropped, and dots are removed from what follows.
    4. Fractional seconds followed by a trailing "Z" or "UTC" are removed
       together with that suffix.
    5. Anything from the first "(" onwards is cut off.
    6. If the first run of Chinese characters is a month name, it is replaced
       by its two-digit month number.

    :param date_time: raw date string
    :return: the normalized (upper-cased) string
    """
    date_time = date_time.strip().upper()

    # English "<N> <unit>(s) ago" -> "<N><unit>前". Each unit word is replaced
    # in its plural form first so that no stray "S" is left behind.
    if "AGO" in date_time:
        english_units = (
            ("SECOND", "秒"),
            ("MINUTE", "分"),
            ("HOUR", "小时"),
            ("DAY", "天"),
            ("WEEK", "周"),
            ("MONTH", "月"),
            ("YEAR", "年"),
        )
        for word, unit in english_units:
            date_time = date_time.replace(word + "S", unit).replace(word, unit)
        date_time = date_time.replace("AGO", "前").replace(" ", "")

    # Keep only the text after the label (presumably the Lithuanian word for
    # "updated" -- TODO confirm). The result must be assigned back, otherwise
    # the label is never removed.
    if "ATNAUJINTA" in date_time:
        date_time = date_time.split("ATNAUJINTA")[1].strip()

    # "Published: Sept. 3, 2020" -> "SEPT 3, 2020"
    for label in ("PUBLISHED:", "UPDATED:"):
        if label in date_time:
            date_time = date_time.split(label)[1].strip().replace(".", "")

    # Drop fractional seconds plus a trailing Z / UTC marker, e.g.
    # "...T12:00:00.000Z" -> "...T12:00:00". The fraction must directly follow
    # a ":SS" field so dotted dates such as "2020.09.03 UTC" stay untouched.
    # (str.strip() cannot be used here: it strips a set of characters from
    # both ends, not a suffix, and would eat the seconds as well.)
    date_time = re.sub(r"(:\d{2})\.\d+\s*(?:Z|UTC)$", r"\1", date_time)

    # Cut trailing parenthesised remarks: "2020-09-03 (Thursday)".
    date_time = date_time.split("(")[0].strip()

    # Chinese month names -> two-digit month numbers. Only the first run of
    # Chinese characters is examined, and it must match a name exactly.
    chinese_months = {
        "一月": "01", "二月": "02", "三月": "03", "四月": "04", "五月": "05", "六月": "06", "七月": "07",
        "八月": "08", "九月": "09", "十月": "10", "十一": "11", "十二": "12", "十一月": "11", "十二月": "12",
    }
    chinese_runs = re.findall(r'[\u4e00-\u9fa5]+', date_time)
    if chinese_runs and chinese_runs[0] in chinese_months:
        date_time = date_time.replace(chinese_runs[0], chinese_months[chinese_runs[0]])

    return date_time
def getDateTime(date_time):
    """Convert a scraped date/time value to a ``%Y-%m-%d %H:%M:%S`` string.

    :param date_time: value to convert; anything that is not a ``str`` is
        passed through ``str()`` first
    :return: the formatted date/time string, or ``''`` when the input is
        empty, is an all-digit string of 10 or more characters whose length
        is neither 10 nor 13, or cannot be parsed
    """
    out_format = "%Y-%m-%d %H:%M:%S"
    if not isinstance(date_time, str):
        date_time = str(date_time)
    date_time = dealstring(date_time)
    try:
        # All-digit strings are treated as Unix timestamps:
        # 10 digits = seconds, 13 digits = milliseconds.
        digit_count = len(date_time)
        if date_time.isdigit() and digit_count >= 10:
            if digit_count == 10:
                epoch_seconds = int(date_time)
            elif digit_count == 13:
                epoch_seconds = int(date_time) / 1000
            else:
                return ''
            return time.strftime(out_format, time.localtime(epoch_seconds))

        if date_time == '':
            return ''

        # A string ending in an hour marker needs a digit after it.
        if date_time[-1] in ('点', '时'):
            date_time += '0'

        # '下午' switches to 12-hour handling.
        ths = '12' if '下午' in date_time else '24'

        # Compact dates such as 20/0903.
        if len(date_time) == 7 and date_time.count('/') == 1:
            date_time = date_time.replace('/', '')

        # "<N><unit>前" / "<N><unit>之前" (a string ending in '之前' also
        # ends in '前', so one suffix test covers both).
        if date_time.endswith('前'):
            return DateTimebefore(date_time).strftime(out_format)

        # Keywords such as today / yesterday / the day before yesterday.
        one_day = datetime.timedelta(days=1)
        for keyword, days_back in date_dict.items():
            now_ts = time.time()
            if date_time == keyword:
                # Keyword only: shift the current moment back by whole days.
                now_text = time.strftime(out_format, time.localtime(now_ts))
                shifted = datetimeConversion(now_text, ths) - one_day * days_back
                return shifted.strftime(out_format)
            if keyword in date_time:
                # Keyword followed by more text (not equal to the keyword, so
                # necessarily longer): substitute the keyword with the
                # concrete date and parse the whole string.
                today_text = time.strftime("%Y-%m-%d", time.localtime(now_ts))
                day_text = (datetimeConversion(today_text) - one_day * days_back).strftime("%Y-%m-%d")
                date_time = re.sub(keyword, day_text + ' ', date_time)
                return datetimeConversion(date_time, ths).strftime(out_format)

        result = datetimeConversion(date_time, ths).strftime(out_format)
    except Exception as e:
        logging.info(f'-时间解析出错--{date_time}-{repr(e)}')
        return ''
    return result
# Convert a "<N><unit> ago" expression into a datetime.
def DateTimebefore(date_time):
    """Return the moment that lies ``<N><unit>`` before now.

    The '前' / '之' markers are removed, then the leading digits are taken as
    the amount and the remainder as the unit, which is looked up in
    ``date_before_dict``. Whitespace around the unit is ignored. When the
    unit is not found as written, a leading measure word ('个' / '個') is
    dropped and the lookup retried, so '3个小时前' resolves like '3小时前'.

    :param date_time: relative time string such as '3天前' or '2小时之前'
    :return: ``datetime.datetime`` for the current local time minus the offset
    :raises ValueError: if the string does not start with a number followed
        by a unit
    :raises KeyError: if the unit is not a key of ``date_before_dict``
    """
    expression = date_time.replace('前', '').replace('之', '').strip()

    # Split "<digits><unit>", tolerating whitespace between the two parts.
    match = re.match(r'(\d+)\s*(.+)$', expression)
    if match is None:
        raise ValueError(f'expected "<number><unit>", got {date_time!r}')
    amount = int(match.group(1))
    unit = match.group(2).strip()

    # '个月' is a dictionary key of its own, so the measure word is only
    # dropped when the unit is unknown as written.
    if unit not in date_before_dict:
        unit = unit.lstrip('个個')

    return datetime.datetime.today() - date_before_dict[unit] * amount
# Build a datetime from a date string.
def datetimeConversion(date_time, ths='24'):
    """Parse a date/time string into a naive ``datetime.datetime``.

    Whitespace runs and '星期' plus the following character are collapsed to
    a single space, the markers in ``replace_dict`` are substituted, and the
    result is parsed with ``dateutil.parser.parse(..., yearfirst=True)``.
    Time zone information and microseconds are discarded.

    :param date_time: date/time string
    :param ths: ``'12'`` to treat the parsed hour as a PM hour, anything else
        (default ``'24'``) to leave it as parsed
    :return: ``datetime.datetime`` without tzinfo; whatever ``parse`` raises
        for an unparseable string propagates to the caller
    """
    # A literal ":00" means the zero time was written explicitly and must not
    # be mistaken for "no time given" further down.
    has_explicit_zero = ':00' in date_time

    date_time = re.sub(r'\s+|星期.?', ' ', date_time)
    for marker, separator in replace_dict.items():
        date_time = date_time.replace(marker, separator)
    # A trailing ':' is left over when the string ended in a time marker.
    if date_time.endswith(':'):
        date_time = date_time[:-1]

    # Drop tzinfo so the value can be compared with the naive current time.
    parsed = parse(date_time, yearfirst=True).replace(tzinfo=None, microsecond=0)

    # 12-hour -> 24-hour. Only hours before noon are shifted; 12:xx PM and
    # hours already given in 24-hour form would otherwise roll into the next
    # day.
    if ths == '12' and parsed.hour < 12:
        parsed += datetime.timedelta(hours=12)

    # One snapshot of "now" keeps the comparisons and the filled-in time
    # consistent with each other.
    now = datetime.datetime.today()

    # A date in the future is assumed to belong to the previous year.
    if parsed > now:
        parsed -= relativedelta(years=1)

    # Date without a time (hour 0, or 12 after the PM shift): use the current
    # time of day instead.
    if parsed.hour in (0, 12) and parsed.minute == 0 and parsed.second == 0 and not has_explicit_zero:
        parsed = parsed.replace(hour=now.hour, minute=now.minute, second=now.second)

    return parsed
# Convert a date string to a Unix timestamp.
def timestampConversion(date_time):
    """Convert a date/time value to a Unix timestamp.

    The value is first normalized with ``getDateTime``; the resulting string
    is then converted with ``time.strptime`` and ``time.mktime``.

    :param date_time: the date/time value to convert
    :return: the timestamp as an ``int``, or ``''`` when ``getDateTime``
        returns ``''``
    """
    formatted = getDateTime(date_time)
    if formatted != '':
        time_tuple = time.strptime(formatted, "%Y-%m-%d %H:%M:%S")
        return int(time.mktime(time_tuple))
    return ''
if __name__ == '__main__':
    # Manual smoke check: convert one sample, then print the elapsed time
    # followed by the converted value.
    started_at = time.time()
    sample = "1 year ago"
    converted = getDateTime(sample)
    elapsed = time.time() - started_at
    print(elapsed)
    print(converted)