爬虫-Day5-登录和IP代理
1.requests登录反爬
import requests
# Login-protected scraping: instead of automating the login, reuse the
# session cookie captured from a real, manually logged-in browser.
"""
1) Open the target site in Chrome and complete the login by hand.
2) Grab the logged-in cookie for the site:
   DevTools -> Network -> All -> site domain -> Headers -> Request Headers -> cookie
3) Add the pair 'cookie': <captured cookie string> to the headers dict.
"""
headers = {
    # NOTE(review): cookie pasted from a logged-in browser session. It will
    # expire; once it does, the server returns the logged-out page again.
    # Presumably a 'user-agent' should accompany it — confirm the site
    # accepts the default requests UA.
    'cookie': '_zap=bccd72c4-94da-4aad-b6f4-c4a076540641; d_c0="APCfiebWwxSPTmf4t0eyATFtTgFHdTZJ_sk=|1649519947"; _xsrf=7RrxkbJCk4djlHfCY5NfKwhhm5IDMyOD; __snaker__id=ncPyNsuTImwpYs5U; gdxidpyhxdE=R63k63BTZgZmXMVXEoiGKqKXyt%5C%2Fwv%5CicGl9ILEMdLjgbXj7nk27VJCDgVByWgL2E9%5C81w5II3sRI%2BLlrU%5CNujzOp%2FwpBDxMoiUCttYM9TUr%5C%2BQqMfbqqZJBEnfpo9CWB%2FuKHMltgaSI1NYgTxXcR3WmZ5ZsoHtDeaKKogUenjN9E8Lv%3A1652758223246; _9755xjdesxxd_=32;
}
# With the cookie attached the server serves the logged-in page.
response = requests.get('https://www.zhihu.com/', headers=headers)
print(response.text)
2.selenium登录反爬
from selenium.webdriver import Chrome
# BUG FIX: `loads` is used below to restore the cookies but was never
# imported (only `dumps` was), which raised NameError at runtime.
from json import dumps, loads

# --- Step 1: open the site, let the user log in by hand, then persist
# the session cookies to disk so later runs can skip the login. ---
b = Chrome()
b.get('https://www.taobao.com')
# Block until the user confirms the login is finished in the browser.
input('登录完成:')
cookies = b.get_cookies()
with open('files/taobao.json', 'w', encoding='utf-8') as f:
    f.write(dumps(cookies))
b.close()

# --- Step 2: fresh browser; restore the saved cookies and reload so the
# site recognizes the existing session. ---
b = Chrome()
# Must visit the domain first: add_cookie only accepts cookies for the
# domain of the current page.
b.get('https://www.taobao.com')
with open('files/taobao.json', encoding='utf-8') as f:
    cookies = loads(f.read())
for x in cookies:
    b.add_cookie(x)
# Reload so the injected cookies take effect.
b.get('https://www.taobao.com')
input('end:')
b.close()
3.requests使用代理
import requests
# Present a normal browser user-agent so the site does not reject the
# bare python-requests client.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
# Route traffic through a proxy so the target sees the proxy's IP, not
# ours; requests picks the entry whose key matches the URL's scheme.
# NOTE(review): the values carry no scheme ('ip:port'); requests prepends
# 'http://' itself, but an explicit scheme would be clearer — confirm
# against the installed requests version.
proxies = {
    'https': '183.165.224.25:4554',
    'http': '183.165.224.25:4554'
}
response = requests.get('https://www.maoyan.com/', headers=headers, proxies=proxies)
# Force UTF-8 in case the server omits or mislabels the charset.
response.encoding = 'utf-8'
print(response.text)
4.使用代理的实际用法
import requests
import time
from bs4 import BeautifulSoup
def get_ip():
    """Fetch one proxy address from the proxy-vendor API, retrying on failure.

    The API returns the bare ``ip:port`` text on success and a JSON object
    (starting with ``{``) on error; on error we wait 2 seconds and retry.

    :return: proxy address string in the form ``ip:port``
    """
    while True:
        # BUG FIX: the original URL ended in '®ions=' — an HTML-entity
        # mangling of '&regions=' — restored here.
        response = requests.get(
            'http://d.jghttp.alicloudecs.com/getip?num=1&type=1&pro=510000&city=510600&yys=0&port=1&time=2&ts=0&ys=0&cs=0&lb=4&sb=0&pb=4&mr=1&regions='
        )
        # strip(): a trailing newline from the API would otherwise end up
        # inside the proxy URL and break the request that uses it.
        result = response.text.strip()
        # Error responses are JSON; startswith() is also safe on an empty
        # body, where the original result[0] raised IndexError.
        if result.startswith('{'):
            print('ip获取失败')
            time.sleep(2)
        else:
            return result
def get_net_data():
    """Scrape maoyan's front page through rotating proxies.

    Loops forever: fetch a fresh proxy, request the page through it, and
    stop only once the response actually contains the movie list (an
    anti-bot/captcha page yields no '.movie-list .movie-item' nodes).
    """
    url = 'https://www.maoyan.com/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    while True:
        # strip() keeps any trailing newline from the proxy API out of
        # the proxy URL.
        ip = get_ip().strip()
        print(ip)
        proxy = {
            'https': ip
        }
        try:
            # BUG FIX: a dead/banned proxy used to raise out of the loop
            # and kill the retry logic; timeout guards against a proxy
            # that accepts the connection but never answers.
            response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        except requests.RequestException:
            # Rotate to the next proxy and try again.
            continue
        response.encoding = 'utf-8'
        print(response.text)
        soup = BeautifulSoup(response.text, 'lxml')
        movies_div = soup.select('.movie-list .movie-item')
        if len(movies_div) == 0:
            # Anti-bot page: no movie nodes -> rotate proxy and retry.
            continue
        else:
            print('爬虫成功!做后续的解析操作')
            break


if __name__ == '__main__':
    get_net_data()
5.selenium使用代理
from selenium.webdriver import Chrome, ChromeOptions
# Launch Chrome behind a forward proxy: every request the browser makes
# goes out through the given address instead of the local IP.
options = ChromeOptions()
options.add_argument('--proxy-server=http://115.208.231.37:4545')
b = Chrome(options=options)
b.get('https://www.maoyan.com/')
print(b.page_source)
# Keep the browser open until the user presses Enter.
input('end:')
b.close()