#!/usr/bin/env python
# coding: utf-8
import requests, urllib2
from lxml import etree
import time, threading
# Check proxy
def check_proxy(ip, port):
    """Check whether ip:port works as an HTTP proxy.

    Fetches the module-level ``ip_check_url`` through the proxy and
    prints the proxy address on success.

    Args:
        ip:   proxy host as a string.
        port: proxy port as a string or int.

    Returns:
        True when the proxy answered the test request, False otherwise.

    Relies on module-level globals set in __main__: ip_check_url,
    user_agent, socket_timeout.
    """
    proxy_address = "%s:%s" % (ip, port)
    try:
        handler = urllib2.ProxyHandler({"http": proxy_address})
        opener = urllib2.build_opener(handler)
        # NOTE(review): the original comment said adding this header broke
        # detection for unknown reasons — kept as in the original; confirm.
        opener.addheaders = [('User-agent', user_agent)]
        req = urllib2.Request(ip_check_url)
        # Use opener.open() instead of install_opener()+urlopen():
        # install_opener() mutates process-global state, which races
        # between the concurrent checker threads.
        # The timeout keeps a dead proxy from hanging a thread forever.
        conn = opener.open(req, timeout=socket_timeout)
        conn.read()  # force the full round trip through the proxy
        conn.close()
    except Exception:
        # A bad proxy must only fail this one check.  The original code
        # called exit(2) here, which both made the following `return`
        # unreachable and killed the whole process (and every other
        # checker thread) on the first dead proxy.
        return False
    print (" WORKING: " + proxy_address)
    return True
if __name__ == '__main__':
    # Page listing free proxies to scrape.
    url = "http://www.kuaidaili.com/free/outtr/1"
    headers = {'content-type': 'text/html',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Upgrade-Insecure-Requests':'1',
    'Host':'www.kuaidaili.com',
    'Referer':'http://www.kuaidaili.com/free/outtr/',
    'Cookie':'channelid=0; sid=1486372186965373; _ga=GA1.2.1398510836.1486259777; _gat=1; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1486259777; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1487298326',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # Globals read by check_proxy().
    ip_check_url = 'http://www.baidu.com/index.php'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
    socket_timeout = 10
    try:
        # The explicit timeout keeps the scrape from hanging forever if
        # the listing site stops responding (the original had none).
        r = requests.get(url, headers=headers, timeout=socket_timeout)
        r.raise_for_status()  # raise on any non-2xx status code
        tree = etree.HTML(r.content)
        # First table column holds the IP, second holds the port.
        ip_nodes = tree.xpath("//tr//td[1]/text()")
        port_nodes = tree.xpath("//tr//td[2]/text()")
        # Pair each IP with its port; duplicate IPs collapse to one entry.
        proxy_map = dict(zip(ip_nodes, port_nodes))
        # Check every proxy concurrently — each check is network-bound.
        threads = []
        for (ip, port) in proxy_map.items():
            t = threading.Thread(target=check_proxy, args=(ip, port))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join(10)  # wait at most 10s per thread before moving on
    except requests.RequestException as e:
        print(e)
这是几个月前写的脚本,用于获取快代理的代理ip。
今天是20170920,需要用国外的代理ip。今天使用这个脚本,先更新了cookie的值。
怎么还报错。
521 Server Error: for url: http://www.kuaidaili.com/free/outha/1/
改用curl命令获取页面,返回的居然是一段js代码,真是奇怪。
C:\Users\Administrator>curl http://www.kuaidaili.com/free/outha/1/
<html><body><script language="javascript"> window.onload=setTimeout("aw(68)", 20
0); function aw(WE) {var qo, mo="", no="", oo = [0x4d,0x69,0x04,0x61,0x96,0xa4,0
xf9,0x6f,0xc0,0xe9,0xcf,0x20,0x7d,0xca,0x18,0xda,0x18,0x5d,0xc2,0xd6,0x16,0x82,0
x20,0x15,0x8e,0x3b,0x98,0xe1,0x57,0xbc,0x0c,0x2d,0x82,0xd3,0x31,0xb4,0x2a,0x3e,0
x56,0x8e,0xd3,0x35,0x3d,0x9a,0xff,0x16,0x4e,0xb7,0xe7,0x18,0x0d,0x6a,0xcf,0xef,0
现在用scrapy爬一个网站始终遇到521错误,是怎么回事呢?
请教各位,python编写爬虫,返回http error 521怎么解决
找了一个代理网站:http://www.66ip.cn/areaindex_27/1.html