# 代理 IP
# 爬虫默认使用的是真实 IP;为了伪装真实 IP,需要使用代理 IP。
# Redis 是内存型数据库,既支持将数据存储在内存中,也可以将数据持久化到本地存储。
# Redis 非常适合做数据的缓存。
# 先拿知乎练手,模仿用户正常登录知乎。
# Send the request to the page with the Cookie obtained after a successful
# login placed directly in the request headers.
url = 'https://www.zhihu.com/'
# 发送请求时需要携带请求头 headers。
# 访问 zhihu.com,以正常身份登录知乎,打开网页后按 F12 打开开发者工具,查看请求头(含 Cookie)。
# Request headers sent with the Zhihu request.
# NOTE(review): the Cookie value embeds what look like session credentials
# (z_c0, _xsrf, capsion_ticket) -- presumably copied from a logged-in browser
# session; confirm they are expired/revoked, and prefer loading them from the
# environment or a local untracked file instead of committing them.
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/signup?next=%2F",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) "
        "Gecko/20100101 Firefox/60.0"
    ),
    # One adjacent literal per cookie pair; the pieces concatenate to a single
    # "; "-separated Cookie header value.
    "Cookie": (
        'd_c0="AGAku45jtw2PTu09Dpvhke4klei4JarodIE=|1528425319"; '
        'q_c1=a46ba33124a2403cb907a2d9105a7bd4|1528425319000|1528425319000; '
        'capsion_ticket="2|1:0|10:1528705722|14:capsion_ticket|44:NzE1YzNhZmJiZjIxNDA1MTg4ZTdkN2YyMTFiNWQwNTk=|42d9339a55b21206f1cae511940cab2468b6e201c1adce2c958be61edfadb1a0"; '
        '_zap=cc087957-a74b-43fc-a1d9-f7bd685897b7; '
        '_xsrf=8e9717b7-05a4-481c-8e82-3ca140d5b266; '
        'tgw_l7_route=156dfd931a77f9586c0da07030f2df36; '
        'z_c0="2|1:0|10:1528705732|4:z_c0|92:Mi4xRHBTMkJRQUFBQUFBWUNTN2ptTzNEU1lBQUFCZ0FsVk54SUFMWEFEMGtfeUowbzNNeXlQRjcwYXVSNV9zMHV1UXZn|27f3bcbcc0b8271658d88009ef05d80b3da8b6df2fb156a7c186a5d076047a63'
    ),
}
import requests

# allow_redirects=False disables automatic redirect following; without it the
# 302 status code of the first response cannot be observed.
# A timeout is passed because requests otherwise waits indefinitely on an
# unresponsive server.
response = requests.get(url, headers=headers, allow_redirects=False, timeout=10)
print(response.status_code)  # the author's note records 302 here
print(response.text)
# 以下代码从代理池获取一个随机代理 IP,并通过该代理发送请求。
import requests
from requests.exceptions import ConnectionError, RequestException, Timeout

# Endpoint of the locally running proxy-pool service queried by get_proxy().
PROXY_POOL_URL = 'http://localhost:5000/get'
# Seconds to wait on each HTTP request; requests has no default timeout and
# would otherwise block indefinitely on a dead pool or a dead proxy.
REQUEST_TIMEOUT = 10


def get_proxy():
    """Fetch one proxy address from the proxy pool at PROXY_POOL_URL.

    Returns:
        The response body with surrounding whitespace removed (presumably a
        "host:port" string -- confirm against the pool service), or None when
        the pool is unreachable, times out, answers with a non-200 status, or
        returns an empty body.
    """
    try:
        response = requests.get(PROXY_POOL_URL, timeout=REQUEST_TIMEOUT)
    except (ConnectionError, Timeout):
        return None
    if response.status_code != 200:
        # Explicit None instead of falling off the end of the function.
        return None
    # Strip stray whitespace/newlines so the value can be embedded in a URL;
    # an empty body is reported as "no proxy".
    return response.text.strip() or None


def get_html():
    """Request http://www.baidu.com through a proxy taken from the pool.

    Prints the proxy that was obtained, then the HTTP status code on success
    or '----' when no proxy is available or the proxied request fails.
    """
    proxy = get_proxy()
    print(proxy)
    if proxy is None:
        # Without this guard 'http://' + None raises TypeError before the
        # request is even attempted.
        print('----')
        return
    proxies = {'http': 'http://' + proxy}
    try:
        response = requests.get(
            'http://www.baidu.com',
            proxies=proxies,
            timeout=REQUEST_TIMEOUT,
        )
    except RequestException:
        # Only request failures are treated as "proxy unusable";
        # KeyboardInterrupt and programming errors still propagate.
        print('----')
    else:
        print(response.status_code)


get_html()