python3 日期解析问题

# -*- coding:utf-8 -*-

"""FaceBook 发布时间解析规则"""
import re
import time
import dateparser
from loguru import logger
import traceback


print('hello world')

# ---------------------------------------------------------------------------
# Regex fragments for Facebook-style publish timestamps.
# The fragments are plain strings so they can be composed with f-strings;
# callers are expected to compile them with re.IGNORECASE.
# ---------------------------------------------------------------------------

# Absolute timestamps, e.g. "December 4 at 11:46 pm" or "Yesterday at 04:33".
hour = r'\d{1,2}'
minute = r'\d{1,2}'
# The trailing empty alternative makes the AM/PM marker optional, so 24-hour
# times without a marker still match.
period = r'AM|PM|'
month = (
    r"Jan(?:uary)?|"
    r"Feb(?:ruary)?|"
    r"Mar(?:ch)?|"
    r"Apr(?:il)?|"
    r"May|"
    r"Jun(?:e)?|"
    r"Jul(?:y)?|"
    r"Aug(?:ust)?|"
    r"Sep(?:tember)?|"
    r"Oct(?:ober)?|"
    r"Nov(?:ember)?|"
    r"Dec(?:ember)?"
)
day_of_month = r"\d{1,2}"
# "<month> <day>[, <year>]" and "<day> <month>[, <year>]".
specific_date_md = f'(?:{month}) {day_of_month}' + r'(?:,? \d{4})?'
specific_date_dm = f'{day_of_month} (?:{month})' + r'(?:,? \d{4})?'
date = f'{specific_date_md}|{specific_date_dm}|Today|Yesterday'
exact_time = f"(?:{date}) at {hour}:{minute} ?(?:{period})"

# Relative timestamps, e.g. "19 h", "10 mins", "2d".  Each pattern is anchored
# so it only matches when the whole string is a relative time.  Raw strings are
# required here: "\d" and "\s" are not valid escapes in ordinary string literals.
relative_time_hours = r'^\d{1,2}\s?h(?:rs?)?$'
relative_time_minutes = r'^\d{1,2}\s?m(?:ins?)?$'
relative_time_days = r'^\d{1,2}\s?d(?:ays?)?$'
# Fixed: the original pattern had a stray ")" that made it impossible to compile.
relative_time_weeks = r'^\d{1,2}\s?wk$'
relative_time_months = r'^\d{1,2}\s?(?:mth|mo)$'
relative_time_years = r'^\d{1,2}\s?yr$'
relative_time = f'{relative_time_years}|{relative_time_months}|{relative_time_days}|{relative_time_hours}|{relative_time_minutes}|{relative_time_weeks}'


# Sample inputs for the relative-time rules, grouped by unit.
hours_test_case_list = [
    '19 h',
    '19h',
    '19 hr',
    '19 hrs',
]
minutes_test_case_list = [
    '10m',
    '10 m',
    '10 mins',
    '1 min',
]
days_test_case_list = [
    '1 d',
    '2d',
]

def test_hours():
    """Print how each entry of ``hours_test_case_list`` matches the hours rule.

    For every sample the compiled pattern and the match object are printed;
    when the sample matches, the matched text is printed as well.

    Returns:
        None. The function only prints.
    """
    # Compile once and match with the compiled object so that re.IGNORECASE
    # actually applies (the original matched with the raw pattern string).
    hours_com = re.compile(relative_time_hours, re.IGNORECASE)
    for test_case in hours_test_case_list:
        print('hour_com:', hours_com)
        test_case_res = hours_com.match(test_case)
        print('test_case_res:', test_case_res)
        if test_case_res:
            # The pattern has no capturing groups, so group(1) would raise
            # IndexError; group(0) is the whole matched text.
            print(f'test_case:{test_case}--hours_result:{test_case_res.group(0)}')

def test_minutes():
    """Print how each entry of ``minutes_test_case_list`` matches the minutes rule.

    For every sample the compiled pattern and the match object are printed;
    when the sample matches, the matched text is printed as well.

    Returns:
        None. The function only prints.
    """
    # Compile once and match with the compiled object so that re.IGNORECASE
    # actually applies (the original matched with the raw pattern string).
    minute_com = re.compile(relative_time_minutes, re.IGNORECASE)
    for test_case in minutes_test_case_list:
        print('minute_com:', minute_com)
        test_case_res = minute_com.match(test_case)
        print('test_case_res:', test_case_res)
        if test_case_res:
            # The pattern has no capturing groups, so group(1) would raise
            # IndexError; group(0) is the whole matched text.
            print(f'test_case:{test_case}--minutes_result:{test_case_res.group(0)}')

def test_days():
    """Print how each entry of ``days_test_case_list`` matches the days rule.

    For every sample the compiled pattern and the match object are printed;
    when the sample matches, the matched text is printed as well.

    Returns:
        None. The function only prints.
    """
    # Compile once and match with the compiled object so that re.IGNORECASE
    # actually applies (the original matched with the raw pattern string).
    day_com = re.compile(relative_time_days, re.IGNORECASE)
    for test_case in days_test_case_list:
        print('day_com:', day_com)
        test_case_res = day_com.match(test_case)
        print('test_case_res:', test_case_res)
        if test_case_res:
            # The pattern has no capturing groups, so group(1) would raise
            # IndexError; group(0) is the whole matched text.
            print(f'test_case:{test_case}--days_result:{test_case_res.group(0)}')



def get_publish_date():
    """Parse a fixed list of sample publish-time strings and return the last result.

    Each sample is stripped and searched with a combined regex covering
    absolute timestamps ("December 4 at 11:46 pm"), relative timestamps
    ("19 h", "10 mins") and bare dates ("February 16, 2013").  The matched
    text (or the whole string when nothing matches) is handed to
    ``dateparser.parse`` and converted to epoch milliseconds.

    Returns:
        dict: ``{'publish_time': <epoch milliseconds>}`` for the last sample
        processed.  When a sample cannot be parsed its value is the current
        time.  The dict is empty only if the sample list is empty.
    """
    # The patterns do not depend on the sample, so build and compile them once
    # instead of on every loop iteration.
    hour = r'\d{1,2}'
    minute = r'\d{1,2}'
    # The trailing empty alternative makes the AM/PM marker optional.
    period = r'AM|PM|'
    month = (
        r"Jan(?:uary)?|"
        r"Feb(?:ruary)?|"
        r"Mar(?:ch)?|"
        r"Apr(?:il)?|"
        r"May|"
        r"Jun(?:e)?|"
        r"Jul(?:y)?|"
        r"Aug(?:ust)?|"
        r"Sep(?:tember)?|"
        r"Oct(?:ober)?|"
        r"Nov(?:ember)?|"
        r"Dec(?:ember)?"
    )
    day_of_month = r"\d{1,2}"
    specific_date_md = f'(?:{month}) {day_of_month}' + r'(?:,? \d{4})?'
    specific_date_dm = f'{day_of_month} (?:{month})' + r'(?:,? \d{4})?'
    date = f'{specific_date_md}|{specific_date_dm}|Today|Yesterday'
    exact_time = f"(?:{date}) at {hour}:{minute} ?(?:{period})"
    exact_date = f'{date}'
    relative_time_hours = r'^\d{1,2}\s?h(?:rs?)?$'
    relative_time_minutes = r'^\d{1,2}\s?m(?:ins?)?$'
    relative_time_days = r'^\d{1,2}\s?d(?:ays?)?$'
    relative_time_weeks = r'^\d{1,2}\s?wk$'
    relative_time_months = r'^\d{1,2}\s?(?:mth|mo)$'
    relative_time_years = r'^\d{1,2}\s?yr$'
    # Debug output: show the compiled form of every relative-time rule.
    print('relative_time_hours_com:', re.compile(relative_time_hours))
    print('relative_time_minutes_com:', re.compile(relative_time_minutes))
    print('relative_time_days_com:', re.compile(relative_time_days))
    print('relative_time_weeks_com:', re.compile(relative_time_weeks))
    print('relative_time_years_com:', re.compile(relative_time_years))
    print('relative_time_months:', re.compile(relative_time_months))

    relative_time = f'{relative_time_years}|{relative_time_months}|{relative_time_days}|{relative_time_hours}|{relative_time_minutes}|{relative_time_weeks}'
    # exact_time must come before exact_date: otherwise the date-only rule
    # would win and the " at HH:MM" part would be dropped from the match.
    datetime_regex = re.compile(fr"({exact_time}|{relative_time}|{exact_date})", re.IGNORECASE)

    result = dict()
    test_case_list = ['19h  ', '19 h', '10m', '10 m', '10 mins', '1 d', '2d', '10 hr', '11 hrs', 'December 4 at 11:46 pm',
        'December 4 at 11:46 am', 'February 16, 2013', 'Yesterday at 04:33', '21 December at 02:23', '1 September', '21 October 2020', 'January 6', 'December 27, 2021 at 1:02 AM']
    for test_case in test_case_list:
        # Default to "now" for every sample, so a sample that fails to parse
        # never inherits the timestamp of the previous sample.
        publish_time = int(time.time() * 1000)
        publish_time_str = test_case.strip()

        if publish_time_str:
            logger.info(f'[INFO]publish_time:{publish_time_str}')
            time_match = datetime_regex.search(publish_time_str)
            if time_match:
                # Expand the "mth" abbreviation to "month" before parsing.
                date_str = time_match.group(0).replace("mth", "month")
            else:
                date_str = publish_time_str
                logger.info('[INFO]未能匹配发布日期')
                logger.warning(f'[WARNING]未能匹配发布日期:{date_str}')
            try:
                date_time = dateparser.parse(date_str)
                if date_time is None:
                    # dateparser signals "could not parse" by returning None;
                    # keep the current-time default instead of raising.
                    logger.warning(f'[WARNING]dateparser无法解析发布日期:{date_str}')
                else:
                    publish_time = int(date_time.timestamp() * 1000)
            except Exception:
                # Best-effort boundary: log the failure and keep the default.
                logger.error(f'[ERROR]解析发布时间异常{traceback.format_exc()}')
        result['publish_time'] = publish_time
        logger.info(f'[INFO]publish_time: {publish_time}')
    return result

if __name__ == '__main__':
    # Script entry point. Uncomment the calls below to run the individual
    # relative-time regex checks instead of (or before) the full parse run.
    # test_hours()
    # test_minutes()
    # test_days()
    # NOTE(review): parse_date() is not defined in the visible part of this file.
    # parse_date()
    get_publish_date()

使用 dateparser 模块解析日期字符串:dateparser.parse 返回 datetime 对象,可以再格式化为日期字符串或转换为时间戳。

参考项目:

        facebook_scraper

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值