爬虫---时间转换模块

实际的爬虫项目往往要抓取上万个网站,如何在抓取过程中统一这些网站的数据格式是一个很大的问题。这些网站的时间格式千奇百怪,各种语言都有,所以为了适配其中的大多数网站,我写了这个日期格式转换脚本。

这个脚本并不能百分之百覆盖所有网站,但足以处理其中的大多数。

如有不足之处,请根据项目的需要自行修改。

全部代码

import datetime
import re
import time
import logging
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
from datetime import timezone

# Relative-day words mapped to the number of days before today they denote.
# getDateTime iterates this dict in insertion order.
date_dict = {
    '刚刚': 0, '剛剛': 0, '今天': 0, '今日': 0,
    '昨天': 1, '昨日': 1,
    '前天': 2, '前日': 2,
}

# Substitutions datetimeConversion applies, in this order, to turn Chinese
# date/time markers into separators before the string is parsed.
replace_dict = {
    '年': '-', '月': '-', '日': ' ',
    '时': ':', '時': ':', '点': ':', '點': ':',
    '分': ':', '秒': ' ',
    '.': '-',
    '上午': ' ', '下午': ' ',
}

# Unit word -> length of one such unit; DateTimebefore multiplies the value by
# the number that precedes the unit in "<N><unit>前" expressions.
# Simplified and traditional spellings are both listed so that e.g.
# "5分钟前" and "5分鐘前" resolve to the same delta.
date_before_dict = {
    '年': relativedelta(years=1),
    '月': relativedelta(months=1),
    '个月': relativedelta(months=1),
    '個月': relativedelta(months=1),
    '周': datetime.timedelta(days=7),
    '週': datetime.timedelta(days=7),
    '星期': datetime.timedelta(days=7),
    '天': datetime.timedelta(days=1),
    '日': datetime.timedelta(days=1),
    '时': datetime.timedelta(hours=1),
    '小时': datetime.timedelta(hours=1),
    '時': datetime.timedelta(hours=1),
    '小時': datetime.timedelta(hours=1),
    '分': datetime.timedelta(minutes=1),
    '分钟': datetime.timedelta(minutes=1),
    '分鍾': datetime.timedelta(minutes=1),
    '分鐘': datetime.timedelta(minutes=1),
    '秒': datetime.timedelta(seconds=1),
    '秒钟': datetime.timedelta(seconds=1),
    '秒鍾': datetime.timedelta(seconds=1),
    '秒鐘': datetime.timedelta(seconds=1),
}



# English "<n> <unit> ago" unit words and the Chinese unit each one becomes.
# Plural forms come first so that e.g. "DAYS" is consumed before "DAY".
_AGO_UNITS = (
    ("SECONDS", "秒"), ("SECOND", "秒"),
    ("MINUTES", "分"), ("MINUTE", "分"),
    ("HOURS", "小时"), ("HOUR", "小时"),
    ("DAYS", "天"), ("DAY", "天"),
    ("WEEKS", "周"), ("WEEK", "周"),
    ("MONTHS", "月"), ("MONTH", "月"),
    ("YEARS", "年"), ("YEAR", "年"),
)

# Chinese month names and the two-digit month number that replaces them.
_CN_MONTHS = {
    "一月": "01", "二月": "02", "三月": "03", "四月": "04", "五月": "05",
    "六月": "06", "七月": "07", "八月": "08", "九月": "09", "十月": "10",
    "十一": "11", "十二": "12", "十一月": "11", "十二月": "12",
}

# Fractional seconds followed by a trailing "Z"/"UTC" marker, e.g. ".123Z".
# The look-behind requires a preceding ":SS" so dotted dates are left alone.
_FRACTION_SUFFIX = re.compile(r"(?<=:\d\d)\.\d+\s*(?:Z|UTC)$")

# A run of CJK unified ideographs.
_CJK_RUN = re.compile(r'[\u4e00-\u9fa5]+')


def dealstring(date_time):
    """Pre-clean a raw date string before it is interpreted.

    The string is stripped and upper-cased, then, in order:

    * English "... ago" phrases are rewritten to the Chinese "<N><unit>前"
      form (all spaces removed), e.g. ``"3 days ago"`` -> ``"3天前"``.
    * Text up to and including an ``ATNAUJINTA`` marker is dropped (with a
      following colon, if any).
    * Text up to and including ``PUBLISHED:`` / ``UPDATED:`` is dropped and
      every ``.`` in the remainder is removed.
    * A trailing fractional-seconds + ``Z``/``UTC`` suffix is removed.
    * Everything from the first ``(`` onwards is dropped.
    * If the first run of Chinese characters is exactly a Chinese month name,
      it is replaced by its two-digit month number.

    :param date_time: raw date string
    :return: the cleaned, upper-cased string
    """
    text = date_time.strip().upper()

    if "AGO" in text:
        for english, chinese in _AGO_UNITS:
            text = text.replace(english, chinese)
        text = text.replace(" ", "").replace("AGO", "前")

    if "ATNAUJINTA" in text:
        # Keep what follows the marker, minus a separating colon.
        text = text.split("ATNAUJINTA")[1].strip().lstrip(":").strip()

    for label in ("PUBLISHED:", "UPDATED:"):
        if label in text:
            text = text.split(label)[1].strip().replace(".", "")

    # Remove the suffix with a regex: str.strip() takes a *set of characters*
    # and would also eat matching digits at both ends of the string.
    text = _FRACTION_SUFFIX.sub("", text)

    text = text.split("(")[0]

    runs = _CJK_RUN.findall(text)
    if runs and runs[0] in _CN_MONTHS:
        text = text.replace(runs[0], _CN_MONTHS[runs[0]])
    return text



def getDateTime(date_time):
    """Normalize a scraped date/time value to ``%Y-%m-%d %H:%M:%S``.

    After cleaning the input with ``dealstring`` the following forms are
    tried in order: all-digit epoch timestamps (10 digits = seconds,
    13 digits = milliseconds), relative expressions ending in ``前``
    ("... ago"), relative day words from ``date_dict`` (optionally followed
    by a time of day), and finally anything ``datetimeConversion`` accepts.

    :param date_time: raw value; non-strings are converted with ``str()``
    :return: the formatted local-time string, or ``''`` when the input is
        empty or cannot be interpreted (the failure is logged at INFO level)
    """
    out_fmt = "%Y-%m-%d %H:%M:%S"
    if not isinstance(date_time, str):
        date_time = str(date_time)
    date_time = dealstring(date_time)
    try:
        # Epoch timestamps: only 10- and 13-digit values are accepted.
        if date_time.isdigit() and len(date_time) >= 10:
            if len(date_time) == 10:
                return time.strftime(out_fmt, time.localtime(int(date_time)))
            if len(date_time) == 13:
                return time.strftime(out_fmt, time.localtime(int(date_time) / 1000))
            return ''

        if date_time == '':
            return ''

        # A string ending in an hour marker ("3点") gets a zero minute so the
        # later substitution yields "3:0" rather than a bare number.  The
        # traditional forms are included to match replace_dict.
        if date_time.endswith(('点', '點', '时', '時')):
            date_time += '0'

        # '下午' (afternoon) marks a 12-hour clock value that must be shifted.
        ths = '12' if '下午' in date_time else '24'

        # Compact forms such as "20/0903": drop the single slash.
        if date_time.count('/') == 1 and len(date_time) == 7:
            date_time = date_time.replace('/', '')

        # "<N><unit>前" / "<N><unit>之前": relative to now.
        if date_time.endswith('前'):
            return DateTimebefore(date_time).strftime(out_fmt)

        # Relative day words: today / yesterday / the day before yesterday.
        one_day = datetime.timedelta(days=1)
        now_struct = time.localtime()
        for key, value in date_dict.items():
            # The word on its own: current time of day, shifted back.
            if date_time == key:
                nowtime = time.strftime(out_fmt, now_struct)
                return (datetimeConversion(nowtime, ths) - one_day * value).strftime(out_fmt)

            # The word followed by a time: substitute the concrete date.
            if key in date_time and len(date_time) > len(key):
                nowtime = time.strftime("%Y-%m-%d", now_struct)
                day = (datetimeConversion(nowtime) - one_day * value).strftime("%Y-%m-%d")
                date_time = date_time.replace(key, day + ' ')
                return datetimeConversion(date_time, ths).strftime(out_fmt)

        return datetimeConversion(date_time, ths).strftime(out_fmt)
    except Exception as e:
        # Best effort: unparseable input yields '' rather than an exception.
        logging.info(f'-时间解析出错--{date_time}-{repr(e)}')
        return ''



# Convert a "<N><unit>前" ("N units ago") expression to a datetime
def DateTimebefore(date_time):
    """Return the moment ``N`` units before now for a "<N><unit>前" string.

    Every ``前`` and ``之`` character is removed, the leading digits are read
    as the count and the remainder is looked up in ``date_before_dict``.

    :param date_time: relative time string such as ``"3天前"`` or ``"2小时之前"``
    :return: ``datetime.datetime`` equal to now minus ``N`` units
    :raises ValueError: if the string does not start with a number
    :raises KeyError: if the unit is not a key of ``date_before_dict``
    """
    text = date_time.replace('前', '').replace('之', '').strip()

    # Split the leading count from the unit that follows it.
    leading = re.match(r'\d+', text)
    if leading is None:
        raise ValueError(f'no leading number in relative time: {date_time!r}')
    # Slice rather than split on the digits, and tolerate "3 天".
    unit = text[leading.end():].strip()

    return datetime.datetime.today() - date_before_dict[unit] * int(leading.group())


# Build a datetime from a loosely formatted date string
def datetimeConversion(date_time, ths='24'):
    """Parse a date string into a naive ``datetime`` with second precision.

    Whitespace runs and ``星期X`` weekday markers collapse to a single space,
    the ``replace_dict`` substitutions are applied, and the result is handed
    to ``dateutil``'s ``parse`` with ``yearfirst=True``.  Three heuristics
    follow:

    * ``ths == '12'``: the value is an afternoon time on a 12-hour clock, so
      12 hours are added unless the hour is already 12 or later.
    * A result later than the current moment is moved back one year.
    * A result at exactly 00:00:00 or 12:00:00 whose input did not contain
      ``":00"`` is treated as date-only and receives the current time of day.

    :param date_time: date string
    :param ths: ``'24'`` (default) or ``'12'``
    :return: ``datetime.datetime`` without tzinfo or microseconds
    """
    # An explicit ":00" in the input means a time really was given, so a
    # midnight/noon result must not be mistaken for a date-only value.
    explicit_zero_time = ':00' in date_time

    date_time = re.sub(r'\s+|星期.?', ' ', date_time)
    for key, value in replace_dict.items():
        date_time = date_time.replace(key, value)
    if date_time.endswith(':'):
        date_time = date_time[:-1]

    # Drop tzinfo and sub-second precision so the value compares cleanly
    # with the naive "now" below.
    parsed = parse(date_time, yearfirst=True).replace(tzinfo=None, microsecond=0)

    # 12-hour -> 24-hour; "下午12:30" must stay 12:30, not roll to next day.
    if ths == '12' and parsed.hour < 12:
        parsed += datetime.timedelta(hours=12)

    # One snapshot of "now" keeps hour/minute/second mutually consistent.
    now = datetime.datetime.today()

    # A date in the future is assumed to belong to the previous year.
    if parsed > now:
        parsed -= relativedelta(years=1)

    # Date only, no time of day: borrow the current time.
    if (parsed.hour in (0, 12) and parsed.minute == 0 and parsed.second == 0
            and not explicit_zero_time):
        parsed = parsed.replace(hour=now.hour, minute=now.minute, second=now.second)
    return parsed


# Convert a date string to an epoch timestamp
def timestampConversion(date_time):
    """Convert a date/time value to a Unix timestamp.

    The value is first normalized with ``getDateTime``; the resulting
    ``%Y-%m-%d %H:%M:%S`` string is then read as local time.

    :param date_time: value accepted by ``getDateTime``
    :return: integer timestamp in seconds, or ``''`` when ``getDateTime``
        returned an empty string
    """
    normalized = getDateTime(date_time)
    if normalized == '':
        return ''
    broken_down = time.strptime(normalized, "%Y-%m-%d %H:%M:%S")
    return int(time.mktime(broken_down))


if __name__ == '__main__':
    # Manual smoke run: convert one sample, then print the elapsed seconds
    # followed by the converted value.
    started = time.time()
    sample = "1 year ago"
    converted = getDateTime(sample)
    print(time.time() - started)
    print(converted)

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值