#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time : 2019/12/10 17:26
# @Author : mason.tang
import calendar
from urllib.parse import urlencode

import requests
"""
功能: 爬取百度指数结果
搜索入口地址:
http://index.baidu.com/v2/index.html#/
"""
# Number of days in each month of a common (non-leap) year, keyed by the
# zero-padded month number as a string ("01" .. "12").
month_day_dict = dict(
    zip(
        ("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"),
        (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31),
    )
)
def generateMonthDays(month_day_dict, year='2017', month='03'):
    """Return every date of one calendar month as 'YYYY-MM-DD' strings.

    Args:
        month_day_dict: Mapping from a zero-padded month string ("01".."12")
            to the number of days that month has in a non-leap year.
        year: Year as a string, e.g. '2017'.
        month: Zero-padded month string; used as the key into
            ``month_day_dict`` and copied verbatim into the output.

    Returns:
        List of date strings for day 1 through the last day of the month,
        in ascending order.

    Raises:
        KeyError: If ``month`` is not a key of ``month_day_dict``.
    """
    day_num = month_day_dict[month]
    # The table stores non-leap-year lengths, so February needs a 29th day
    # in leap years. Non-numeric years are left alone rather than rejected.
    if (
        month == '02'
        and day_num == 28
        and str(year).isdecimal()
        and calendar.isleap(int(year))
    ):
        day_num = 29
    return [
        '{}-{}-{:02d}'.format(year, month, day)
        for day in range(1, day_num + 1)
    ]
def generatePeriodDays(start="2019-11-01", end="2019-11-31"):
    """Return every date from ``start`` through ``end`` (inclusive), sorted.

    Every month from the start month to the end month is expanded with
    ``generateMonthDays`` and the resulting dates are then filtered against
    the two bounds by plain string comparison. Because only text is compared,
    an upper bound such as '2019-11-31' is accepted and simply means
    "through the end of November".

    Args:
        start: Lower bound as 'YYYY-MM-DD'.
        end: Upper bound as 'YYYY-MM-DD'.

    Returns:
        Ascending list of 'YYYY-MM-DD' strings; empty when ``start`` is
        later than ``end``.

    Raises:
        ValueError: If the year or month part of a bound is missing or is
            not an integer.
    """
    start_year, start_mon = (int(part) for part in start.split("-")[:2])
    # The end month must come from ``end``; reading it from ``start`` would
    # silently drop every day after the first month of the period.
    end_year, end_mon = (int(part) for part in end.split("-")[:2])

    all_day_list = []
    year, mon = start_year, start_mon
    # Walk month by month so periods longer than two months are fully covered.
    while (year, mon) <= (end_year, end_mon):
        all_day_list += generateMonthDays(
            month_day_dict, year="%04d" % year, month="%02d" % mon
        )
        year, mon = (year, mon + 1) if mon < 12 else (year + 1, 1)

    return sorted({one for one in all_day_list if start <= one <= end})
def reqData(start="2019-11-01", end="2019-12-03", kw="海康威视", headers=None):
    """Request the Baidu Index news data for a keyword over a date range.

    The request goes to
        https://index.baidu.com/api/NewsApi/checkNewsIndex
    with the query parameters ``dates[]`` (comma-joined dates), ``type=day``
    and ``words`` (the keyword). The query string is percent-encoded, e.g.
        dates%5B%5D=2019-11-16%2C2019-11-23&type=day&words=%E8%BF%BD%E6%88%91%E5%90%A7
    which decodes to
        dates[]=2019-11-16,2019-11-23&type=day&words=追我吧

    Args:
        start: First date of the period, 'YYYY-MM-DD'.
        end: Last date of the period, 'YYYY-MM-DD'.
        kw: Search keyword.
        headers: Optional dict of HTTP headers to send with the request.
            NOTE(review): the endpoint appears to require a logged-in
            session, so a Cookie header is probably needed - confirm.

    Returns:
        The response body decoded from JSON.

    Raises:
        Whatever ``requests.get`` raises on connection errors or on the
        5-second timeout, and whatever ``Response.json`` raises when the
        body is not valid JSON.
    """
    base_url = "https://index.baidu.com/api/NewsApi/checkNewsIndex"
    date_list = generatePeriodDays(start=start, end=end)
    date_str = ','.join(date_list)
    # Encode the query so keywords containing '&', '#', '+' or spaces cannot
    # break or alter the request parameters.
    query = urlencode([("dates[]", date_str), ("type", "day"), ("words", kw)])
    search_url = base_url + "?" + query
    print(date_list, date_str, search_url)
    data = requests.get(search_url, headers=headers, timeout=5)
    return data.json()
def dataParser(content, kw="追我吧"):
    """Flatten the news entries of a response into date-sorted table rows.

    ``content['data'][kw]`` is read as a list of per-day dicts, each holding
    a 'date' and a 'news' list of dicts. One row is produced per news dict
    with the columns 'date', 'title', 'url', 'source' and 'same_news';
    a column missing from the news dict is filled with None.

    Args:
        content: Decoded JSON response.
        kw: Keyword whose entry under ``content['data']`` is parsed.

    Returns:
        List of rows (lists). The first row is the column-name header; the
        remaining rows are sorted by their date column. Every row is also
        printed, one per line.

    Raises:
        KeyError: If 'data', ``kw``, or a per-day 'date' / 'news' key is
            missing.
    """
    key_list = ['date', 'title', 'url', 'source', 'same_news']
    rows = []
    for one_dict in content['data'][kw]:
        day_date = one_dict['date']
        for one_son_dict in one_dict['news']:
            row = [one_son_dict.get(key) for key in key_list]
            if row[0] is None:
                # NOTE(review): presumably a news item without its own date
                # belongs to the enclosing day - confirm against the API.
                # It also keeps the sort below from comparing None values.
                row[0] = day_date
            rows.append(row)
    rows.sort(key=lambda row: row[0])
    result = [key_list] + rows
    for one in result:
        print(one)
    return result
if __name__ == '__main__':
    # Script entry point: fetch one month of data for a sample keyword,
    # show the raw response, then print the parsed rows.
    response_json = reqData(kw="追我吧", start="2019-11-11", end="2019-12-11")
    print(response_json)
    dataParser(response_json)
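# PS: the request does not succeed because the endpoint requires a logged-in
# session, and time was limited. Please refer to the following links:
# https://www.jianshu.com/p/86f66f997c3d
# https://www.cnblogs.com/fnng/p/6431484.html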