最近在抓取豆瓣电影信息时发现,请求过于频繁会导致豆瓣后台封禁请求IP,之后的请求都会返回403。查了一圈资料后,发现可以使用代理IP进行访问。代理IP的获取网址为:http://www.xicidaili.com/ 。获取代理IP后,再通过访问搜狗网页对其可用性进行简单的验证。
import time
import requests
import re
### Request headers
# Every request in this script is sent with a desktop Chrome User-Agent.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/51.0.2704.63 Safari/537.36'
)
headers = {'User-Agent': _USER_AGENT}

# Base address of the proxy listing; the page number is appended to it.
url = "http://www.xicidaili.com/wn/"
# Number of listing pages to fetch (original note: "url_num * 100" --
# presumably about 100 proxies per page; confirm against the site).
url_num = 1
### Scrape proxy IPs
# Dotted-quad matcher. The dots are escaped: an unescaped "." matches any
# character, so strings such as "18-05-30 12" would also be accepted.
_IP_PATTERN = re.compile(r'\d+\.\d+\.\d+\.\d+')


def _extract_proxies(page_html):
    """Return the "ip:port" strings found in one listing page.

    The page is scanned line by line. When a line contains an IPv4-looking
    address, the port is taken from the text between the first ">" and the
    next "<" on the line that follows it (assumes the port cell sits on the
    line right after the IP cell -- confirm against the page markup).

    Entries are skipped, rather than raising, when there is no following
    line, when the following line has no ">", or when the extracted port is
    not purely numeric.
    """
    found = []
    rows = page_html.split("\n")
    # Pairing each line with its successor means the last line is never
    # treated as an IP line, so there is no out-of-range lookup for the port.
    for current, following in zip(rows, rows[1:]):
        match = _IP_PATTERN.search(current)
        if match is None:
            continue
        pieces = following.split(">")
        if len(pieces) < 2:
            continue
        port = pieces[1].split("<")[0].strip()
        if port.isdigit():
            found.append(match.group(0) + ":" + port)
    return found


proxy_ip_pool = []
for iurl in range(1, url_num + 1):
    # Without a timeout requests can wait forever on a stalled connection;
    # with it, a stall raises requests.exceptions.Timeout instead.
    proxy_html = requests.get(url + str(iurl), headers=headers, timeout=10).text
    proxy_ip_pool.extend(_extract_proxies(proxy_html))
print(len(proxy_ip_pool))
### Check which scraped proxies are usable
# Each candidate is used once to fetch test_url; only proxies that return a
# successful (non-4xx/5xx) response are kept in proxy_ip_pool_test.
proxy_ip_pool_test = []
test_url = "https://www.sogou.com/"  ## probe target: Sogou
for itest in proxy_ip_pool:
    # NOTE(review): the "https://" scheme on the proxy URL is kept as in the
    # original because this exact string is what gets saved; confirm that
    # the proxies really accept TLS connections, otherwise "http://" may be
    # what is needed here.
    proxies = {"https": "https://" + itest}  # route the probe through the proxy
    try:
        proxy_test = requests.get(test_url, headers=headers, proxies=proxies, timeout=5)
        # An error status (e.g. 403 or 502) means the proxy is not usable
        # even though a response came back.
        proxy_test.raise_for_status()
    except Exception:
        # Best-effort probe: any failure just marks this proxy as unusable.
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop
        # the run.
        print("error proxy ip")
    else:
        print(proxy_test)
        proxy_ip_pool_test.append(proxies["https"])
print(len(proxy_ip_pool_test))
# Save the verified proxies, one per line, to a timestamped text file.
# strftime formats the current local time when no time tuple is passed.
t = time.strftime('%Y-%m-%d_%H_%M_%S')
# The with-block closes the file even if a write fails part-way through.
# open() raises if the C:/bz/ directory does not exist.
with open("C:/bz/" + "proxy_ip_" + t + ".txt", 'w') as f1:
    f1.writelines(line + "\n" for line in proxy_ip_pool_test)