Python爬取百度指数搜索结果,查看热点信息

#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time    : 2019/12/10 17:26
# @Author  : mason.tang



import requests

"""

功能: 爬取百度指数结果
       搜索入口地址:
                    http://index.baidu.com/v2/index.html#/

"""


# Days per month, keyed by the zero-padded month number as a string
# ("01" .. "12"). February is fixed at 28 days; the table itself carries no
# leap-year information.
month_day_dict = {
    "%02d" % month_no: day_count
    for month_no, day_count in enumerate(
        (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31), start=1
    )
}


def generateMonthDays(month_day_dict, year='2017', month='03'):
    """
    Build every calendar date of one month as ``YYYY-MM-DD`` strings.

    Args:
        month_day_dict: Mapping from a month string (same form as ``month``,
            e.g. ``"03"``) to the number of days in that month.
        year: Year as a string; used verbatim as the first date component.
        month: Month as a string; used verbatim both as the lookup key and
            as the second date component.

    Returns:
        List of date strings in ascending day order, with the day
        zero-padded to two digits.

    Raises:
        KeyError: If ``month`` is not a key of ``month_day_dict``.
    """
    import calendar  # local import keeps the function self-contained

    day_num = month_day_dict[month]
    # A static table can only store 28 for February; add the leap day when
    # the requested year actually has one.
    if day_num == 28 and month in ("02", "2") and calendar.isleap(int(year)):
        day_num = 29
    return [
        year + '-' + month + '-' + str(day).zfill(2)
        for day in range(1, day_num + 1)
    ]


def generatePeriodDays(start="2019-11-01", end="2019-11-31"):
    """
    List every day between ``start`` and ``end`` (both inclusive).

    Every month from the start month through the end month is walked, so
    ranges that cross month or year boundaries are covered in full. Month
    lengths come from the ``calendar`` module, so leap years are handled.

    Args:
        start: First day of the period, formatted ``YYYY-MM-DD``.
        end: Last day of the period, formatted ``YYYY-MM-DD``. The bounds are
            compared as strings, so a day number past the real end of the
            month (such as the default ``"2019-11-31"``) simply selects the
            whole month.

    Returns:
        Sorted list of unique ``YYYY-MM-DD`` strings; empty when ``start``
        is later than ``end``.
    """
    import calendar  # local import keeps the function self-contained

    start_parts = start.split("-")
    end_parts = end.split("-")
    year, mon = int(start_parts[0]), int(start_parts[1])
    end_year, end_mon = int(end_parts[0]), int(end_parts[1])

    day_list = []
    while (year, mon) <= (end_year, end_mon):
        days_in_month = calendar.monthrange(year, mon)[1]
        for day in range(1, days_in_month + 1):
            one = "{:04d}-{:02d}-{:02d}".format(year, mon, day)
            # Zero-padded ISO dates sort lexicographically in date order.
            if start <= one <= end:
                day_list.append(one)
        # Advance to the next month, rolling over into the next year.
        mon += 1
        if mon > 12:
            year, mon = year + 1, 1
    return day_list


def reqData(start="2019-11-01", end="2019-12-03", kw="海康威视", headers=None):
    """
    Request the Baidu Index news-check API for a keyword over a date range.

    The request URL has the form (decoded):
        https://index.baidu.com/api/NewsApi/checkNewsIndex?dates[]=2019-11-16,2019-11-23&type=day&words=<keyword>
    and the same URL percent-encoded:
        https://index.baidu.com/api/NewsApi/checkNewsIndex?dates%5B%5D=2019-11-16%2C2019-11-23&type=day&words=%E8%BF%BD%E6%88%91%E5%90%A7
    (``urllib.parse.quote`` can be used to produce the encoded form.)

    Args:
        start: First day of the period, formatted ``YYYY-MM-DD``.
        end: Last day of the period, formatted ``YYYY-MM-DD``.
        kw: Search keyword.
        headers: Optional dict of HTTP request headers. When omitted, a
            minimal ``User-Agent`` header is sent.
            NOTE(review): the notes accompanying this script say the site
            requires login; presumably a logged-in ``Cookie`` header has to
            be supplied here for the call to succeed -- confirm.

    Returns:
        The response body decoded from JSON.
    """
    urlTemplate = "https://index.baidu.com/api/NewsApi/checkNewsIndex?dates[]={}&type=day&words={}"
    if headers is None:
        # Previously an undefined global `headers` was referenced here, which
        # raised NameError before any request was sent.
        headers = {"User-Agent": "Mozilla/5.0"}
    date_list = generatePeriodDays(start=start, end=end)
    date_str = ','.join(date_list)
    search_url = urlTemplate.format(date_str, kw)
    print(date_list, date_str, search_url)
    data = requests.get(search_url, headers=headers, timeout=5)
    content = data.json()
    return content


def dataParser(content, kw="追我吧"):
    """
    Flatten the news entries of an API response into rows and print them.

    Args:
        content: Decoded response dict. ``content['data'][kw]`` must be a
            list of per-day dicts, each holding a ``'news'`` list of news
            item dicts.
        kw: Keyword whose entries are read from ``content['data']``.
            Defaults to the keyword that used to be hard-coded here.

    Returns:
        List of rows. The first row is the header
        ``['date', 'title', 'url', 'source', 'same_news']``; each following
        row holds those fields of one news item (``None`` for a missing
        field), sorted by date with undated items last. Every row is also
        printed.

    Raises:
        KeyError: If ``content`` lacks ``'data'``, ``kw`` or a ``'news'`` entry.
    """
    data_list = content['data'][kw]
    key_list = ['date', 'title', 'url', 'source', 'same_news']
    result = []
    for one_dict in data_list:
        for one_son_dict in one_dict['news']:
            one_tmp_list = []
            for key in key_list:
                try:
                    one_tmp_list.append(one_son_dict[key])
                except (KeyError, TypeError):
                    # Best effort: a missing field (or a malformed item)
                    # becomes None instead of aborting the whole parse.
                    one_tmp_list.append(None)
            result.append(one_tmp_list)
    # Sorting on the bare date fails when some rows have None and others a
    # string; the leading flag orders undated rows after dated ones.
    result.sort(key=lambda row: (row[0] is None, row[0]))
    result.insert(0, key_list)
    for one in result:
        print(one)
    return result


if __name__ == "__main__":
    # Script entry point: fetch one month of results for a sample keyword,
    # dump the raw response, then print it as flattened rows.
    content = reqData(kw="追我吧", start="2019-11-11", end="2019-12-11")
    print(content)
    dataParser(content)

PS:由于百度指数接口需要登录,上面的代码未能成功获取数据;因时间有限,未做进一步处理,请参考以下链接:
https://www.jianshu.com/p/86f66f997c3d
https://www.cnblogs.com/fnng/p/6431484.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值