目标网址
https://mp.csdn.net/postlist
一、请求头带上 cookie 数据
先写一个模块,保存为 spidertool.py:
# 爬虫的请求处理模块
def trans_head_2_dict(head):
    """Convert raw HTTP header text into a dict.

    Each non-blank line of ``head`` is expected to look like
    ``name: value``. A line is split on its first colon only, so values
    that contain colons themselves (URLs, timestamps, cookie payloads)
    are kept intact instead of being truncated.

    Args:
        head: Header text, one ``name: value`` pair per line.

    Returns:
        A dict mapping each stripped header name to its stripped value.
        Blank lines are skipped, so empty input yields an empty dict.

    Raises:
        IndexError: If a non-blank line contains no colon.
    """
    res = {}
    for line in head.splitlines():
        if not line.strip():
            continue
        # maxsplit=1: only the first colon separates the name from the value.
        head_list = line.split(":", 1)
        res[head_list[0].strip()] = head_list[1].strip()
    return res
import requests
import spidertool
# Target URL
url = "https://mp.csdn.net/postlist"

# User-Agent header, written as a raw "name: value" line
ua_str = """user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"""
ua_dict = spidertool.trans_head_2_dict(ua_str)

# Cookie header: replace the placeholder with your own "cookie: ..." line
# (for example, copied from the browser's developer tools while logged in).
cookie_str = "cookie: <paste your own cookie string here>"
cookie_dict = spidertool.trans_head_2_dict(cookie_str)

# Merge both parsed headers into the request headers
headers = {}
headers.update(cookie_dict)
headers.update(ua_dict)
print(headers)

# Send the request; the timeout keeps the script from hanging forever
# if the server never responds.
res = requests.get(url, headers=headers, timeout=10)
con = res.content.decode()

# Save the returned page to a.html
with open('a.html', 'w', encoding='utf8') as f:
    f.write(con)