站在巨人的肩膀上。
想要获取法定节假日时,找到了这位大佬的博客。
https://blog.csdn.net/joson1234567890/article/details/106214810
但是问题在于,该网站进行了更新,之前的代码无法运行,所以我这里做了部分修改。
不过这次需要我们手动打开网页获取一些数据,再进行api的调用
1. 打开网页:https://wannianrili.bmcx.com/
2. 右键打开开发者工具
3. 在右侧选择“网络”(Network)面板
4. 切换日历查询的月份或者年份,可以看到“网络”面板中出现新的请求
5. 查看该请求的 cookie,并且填写到下面的代码中
!!!代码需要有cookie,请按照图示方式获取cookie并且填写,完毕之后就可以运行了
代码如下
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Joson
# @DateTime: 2020-05-19 10:25
# @Description: https://wannianrili.51240.com/
# @Version: 1.0
import csv
import requests
from lxml import etree
from hyper.contrib import HTTP20Adapter
class WanNianRiLi(object):
    """Scrape one year of holiday data from the wannianrili.bmcx.com calendar.

    Instantiating the class immediately downloads all twelve months, prints
    the parsed rows and writes them to ``<year>Holiday.csv``.

    Params:
        year: four-digit year as a string, e.g. ``'2022'``.
    """

    # Column order shared by the CSV and plain-text exports.
    _FIELDS = ['Date', 'Holiday', 'Tag']

    # CSS class of a day's <a> node -> tag written to the output
    # (holiday off / weekend / make-up workday).
    _TAG_BY_CLASS = {
        'wnrl_riqi_xiu': '休假',
        'wnrl_riqi_mo': '周六日',
        'wnrl_riqi_ban': '补班',
    }

    def __init__(self, year):
        self.year = year
        data = self.parseHTML()
        print(data)
        # exportTxt(data) is available as an alternative plain-text export.
        self.exportCSV(data)

    @staticmethod
    def _classify(css_class):
        """Return the output tag for a day's CSS class, or None if unknown."""
        return WanNianRiLi._TAG_BY_CLASS.get(css_class)

    def _parse_day(self, year_month, node):
        """Turn one ``wnrl_riqi`` node into a result row, or None to skip it.

        A day is skipped when it has no ``<a>`` child, when its CSS class is
        missing or not one of the known tags, or when it carries fewer than
        the two text nodes (day number, holiday name) the row needs.
        """
        links = node.xpath('./a')
        if not links:
            return None
        # Resolve the tag from this node only, so a previous day's tag can
        # never leak into a day with an unrecognised class.
        tag = self._classify(links[0].attrib.get('class'))
        if tag is None:
            return None
        texts = node.xpath('.//text()')
        if len(texts) < 2:
            return None
        return {
            'Date': year_month + '-' + texts[0],
            'Holiday': texts[1],
            'Tag': tag,
        }

    def parseHTML(self):
        """Download every month of ``self.year`` and parse the tagged days.

        Returns:
            list[dict]: one dict per tagged day with the keys ``'Date'``,
            ``'Holiday'`` and ``'Tag'``.

        Raises:
            requests.HTTPError: if the site answers with an error status.
        """
        url = 'https://wannianrili.bmcx.com/ajax/'
        headers = {
            ':authority': 'wannianrili.bmcx.com',
            ':method': 'GET',
            ':scheme': 'https',
            # TODO: paste the cookie copied from the browser's developer
            # tools here; the request does not work without it.
            'cookie': '',
            'referer': 'https://wannianrili.bmcx.com/2021-12-01__wannianrili/',
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': 'Windows',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Accept': '*/*',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Dest': 'empty',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }
        result = []
        # One session for the whole year; the context manager closes it.
        with requests.session() as session:
            # The HTTP/2 adapter is what allows the ':'-prefixed
            # pseudo-headers (':authority', ':method', ...) to be sent.
            session.mount(url, HTTP20Adapter())
            for month in range(1, 13):
                year_month = self.year + '-' + '%02d' % month
                payload = {'q': year_month, 'v': 20031914}
                # ':path' has to match the query of the current request.
                headers[':path'] = '/ajax/?q=' + year_month + '&v=20031914'
                response = session.get(url, headers=headers, params=payload)
                response.raise_for_status()
                print('In Working:', year_month)
                element = etree.HTML(response.text)
                if element is None:
                    # Empty or unparsable body: nothing to extract.
                    continue
                for node in element.xpath('//div[@class="wnrl_riqi"]'):
                    row = self._parse_day(year_month, node)
                    if row is not None:
                        result.append(row)
        print(result)
        return result

    def exportCSV(self, data):
        """Write the rows to ``<year>Holiday.csv`` with a header line."""
        # If the file shows garbled characters, add encoding='utf-8-sig'.
        with open(self.year + 'Holiday.csv', 'w', newline='') as f:
            writer = csv.DictWriter(f, self._FIELDS)
            writer.writeheader()
            writer.writerows(data)

    def exportTxt(self, data):
        """Write the rows to ``./<year>-holiday.txt``, one per line.

        Dict rows are written as ``Date,Holiday,Tag``; plain strings are
        written unchanged.
        """
        with open('./' + self.year + '-holiday.txt', 'w') as f:
            for row in data:
                if isinstance(row, dict):
                    row = ','.join(str(row.get(key, '')) for key in self._FIELDS)
                f.write(row + '\n')
if __name__ == '__main__':
    # Script entry point: constructing the scraper for 2022 performs the
    # whole download-and-export run as part of __init__.
    rili = WanNianRiLi(year='2022')