前几天爬取网站时,因请求过多导致 IP 被封,于是想到从网上抓取免费代理,检测其有效性后,使用高匿名代理去爬取所需内容,并在代理被封时切换到新的代理继续爬取。
使用到的库:urllib2(POST 请求还用到 urllib 的 urlencode),均为 Python 2 标准库
函数如下
检测代理ip是否可用
import urllib2
def CheckProxyIsCorrect(proxy_ip, proxy_port):
    """Check whether an HTTP proxy works by fetching httpbin.org/ip through it.

    Args:
        proxy_ip: Proxy host or IP address, as a string.
        proxy_port: Proxy port; either a string or an integer.

    Returns:
        1 if the request succeeds and the response body contains
        ``proxy_ip``; 0 otherwise. Any error raised while fetching is
        printed and reported as 0 rather than propagated.
    """
    # %-formatting accepts an integer port as well as a string one.
    proxy_url = 'http://%s:%s' % (proxy_ip, proxy_port)
    proxy_support = urllib2.ProxyHandler({'http': proxy_url})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    url = 'http://httpbin.org/ip'
    try:
        response = opener.open(url, timeout=5)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        # NOTE: this is a plain substring match against the raw body; the
        # endpoint is expected to echo the address it saw the request from.
        if proxy_ip in html:
            return 1
        return 0
    # HTTPError is a subclass of URLError, so it has to be caught first.
    except urllib2.HTTPError as e:
        # A single pre-formatted string prints the same on Python 2 and 3
        # (a multi-argument print(...) prints a tuple repr on Python 2).
        print('CheckProxyIsCorrect: %s HTTPError: %s' % (proxy_ip, e))
        return 0
    except urllib2.URLError as e:
        print('CheckProxyIsCorrect: %s URLError: %s' % (proxy_ip, e))
        return 0
    except Exception as e:
        # Dead free proxies fail in many ways (timeouts, bad responses, ...);
        # every failure simply means "proxy not usable".
        print('CheckProxyIsCorrect: %s Exception: %s' % (proxy_ip, e))
        return 0
使用代理ip提交GET请求
import urllib2
def UseProxyToGet(proxy_ip, proxy_port):
    """Send a GET request to httpbin.org through an HTTP proxy and print the body.

    Args:
        proxy_ip: Proxy host or IP address, as a string.
        proxy_port: Proxy port; either a string or an integer.

    Returns:
        None. The response body is printed on success; on failure the error
        is printed instead of being propagated.
    """
    # %-formatting accepts an integer port as well as a string one.
    proxy_url = 'http://%s:%s' % (proxy_ip, proxy_port)
    proxy_support = urllib2.ProxyHandler({'http': proxy_url})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    try:
        url = "http://httpbin.org/get?a=1&b=2"
        # Headers set on the opener are sent with every request it makes.
        opener.addheaders = [
            ('Cookie', 'this is cookie'),
            ('Referer', 'this is refere'),
            ('User-Agent', 'this is user-agent'),
        ]
        response = opener.open(url, timeout=5)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        print(html)
    # HTTPError is a subclass of URLError, so it has to be caught first.
    except urllib2.HTTPError as e:
        # A single pre-formatted string prints the same on Python 2 and 3
        # (a multi-argument print(...) prints a tuple repr on Python 2).
        print('UseProxyToGet: %s HTTPError: %s' % (proxy_ip, e))
    except urllib2.URLError as e:
        print('UseProxyToGet: %s URLError: %s' % (proxy_ip, e))
    except Exception as e:
        print('UseProxyToGet: %s Exception: %s' % (proxy_ip, e))
使用代理ip提交POST请求
import urllib, urllib2
def UseProxyToPost(proxy_ip, proxy_port):
    """Send a form-encoded POST to httpbin.org through an HTTP proxy and print the body.

    Args:
        proxy_ip: Proxy host or IP address, as a string.
        proxy_port: Proxy port; either a string or an integer.

    Returns:
        None. The response body is printed on success; on failure the error
        is printed instead of being propagated.
    """
    # %-formatting accepts an integer port as well as a string one.
    proxy_url = 'http://%s:%s' % (proxy_ip, proxy_port)
    proxy_support = urllib2.ProxyHandler({'http': proxy_url})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    try:
        url = "http://httpbin.org/post"
        # Headers set on the opener are sent with every request it makes.
        opener.addheaders = [
            ('Cookie', 'this is cookie'),
            ('Referer', 'this is refere'),
            ('User-Agent', 'this is user-agent'),
        ]
        values = {'a': '1', 'b': '2'}
        # Passing a data argument to open() turns the request into a POST.
        data = urllib.urlencode(values)
        response = opener.open(url, data, timeout=5)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        print(html)
    # HTTPError is a subclass of URLError, so it has to be caught first.
    except urllib2.HTTPError as e:
        # A single pre-formatted string prints the same on Python 2 and 3
        # (a multi-argument print(...) prints a tuple repr on Python 2).
        print('UseProxyToPost: %s HTTPError: %s' % (proxy_ip, e))
    except urllib2.URLError as e:
        print('UseProxyToPost: %s URLError: %s' % (proxy_ip, e))
    except Exception as e:
        print('UseProxyToPost: %s Exception: %s' % (proxy_ip, e))