#!/usr/bin/env python
# coding: utf-8
import requests, urllib2
from lxml import etree
import time, threading
# Check proxy
def check_proxy(ip, port):
    """Check whether ip:port works as an HTTP proxy.

    Fetches the module-level ``ip_check_url`` through the proxy and
    prints the proxy address on success.

    Args:
        ip:   proxy host as a string.
        port: proxy port as a string or int.

    Returns:
        True when the proxy answered the test request, False otherwise.

    Relies on module-level globals set in __main__: ip_check_url,
    user_agent, socket_timeout.
    """
    proxy_address = "%s:%s" % (ip, port)
    try:
        handler = urllib2.ProxyHandler({"http": proxy_address})
        opener = urllib2.build_opener(handler)
        # NOTE(review): the original comment said adding this header broke
        # detection for unknown reasons — kept as in the original; confirm.
        opener.addheaders = [('User-agent', user_agent)]
        req = urllib2.Request(ip_check_url)
        # Use opener.open() instead of install_opener()+urlopen():
        # install_opener() mutates process-global state, which races
        # between the concurrent checker threads.
        # The timeout keeps a dead proxy from hanging a thread forever.
        conn = opener.open(req, timeout=socket_timeout)
        conn.read()  # force the full round trip through the proxy
        conn.close()
    except Exception:
        # A bad proxy must only fail this one check.  The original code
        # called exit(2) here, which both made the following `return`
        # unreachable and killed the whole process (and every other
        # checker thread) on the first dead proxy.
        return False
    print (" WORKING: " + proxy_address)
    return True
if __name__ == '__main__':
    # Page listing free proxies to scrape.
    url = "http://www.kuaidaili.com/free/outtr/1"
    headers = {'content-type': 'text/html',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Upgrade-Insecure-Requests':'1',
    'Host':'www.kuaidaili.com',
    'Referer':'http://www.kuaidaili.com/free/outtr/',
    'Cookie':'channelid=0; sid=1486372186965373; _ga=GA1.2.1398510836.1486259777; _gat=1; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1486259777; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1487298326',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # Globals read by check_proxy().
    ip_check_url = 'http://www.baidu.com/index.php'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
    socket_timeout = 10
    try:
        # The explicit timeout keeps the scrape from hanging forever if
        # the listing site stops responding (the original had none).
        r = requests.get(url, headers=headers, timeout=socket_timeout)
        r.raise_for_status()  # raise on any non-2xx status code
        tree = etree.HTML(r.content)
        # First table column holds the IP, second holds the port.
        ip_nodes = tree.xpath("//tr//td[1]/text()")
        port_nodes = tree.xpath("//tr//td[2]/text()")
        # Pair each IP with its port; duplicate IPs collapse to one entry.
        proxy_map = dict(zip(ip_nodes, port_nodes))
        # Check every proxy concurrently — each check is network-bound.
        threads = []
        for (ip, port) in proxy_map.items():
            t = threading.Thread(target=check_proxy, args=(ip, port))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join(10)  # wait at most 10s per thread before moving on
    except requests.RequestException as e:
        print(e)
这是几个月前写的脚本,用于获取快代理的代理ip。
今天是20170920,需要用国外的代理ip。今天使用这个脚本,先更新了cookie的值。
怎么还报错。
521 Server Error: for url: http://www.kuaidaili.com/free/outha/1/
改用curl命令获取页面,返回的居然是一段js代码,真是奇怪。
C:\Users\Administrator>curl http://www.kuaidaili.com/free/outha/1/
<html><body><script language="javascript"> window.onload=setTimeout("aw(68)", 20
0); function aw(WE) {var qo, mo="", no="", oo = [0x4d,0x69,0x04,0x61,0x96,0xa4,0
xf9,0x6f,0xc0,0xe9,0xcf,0x20,0x7d,0xca,0x18,0xda,0x18,0x5d,0xc2,0xd6,0x16,0x82,0
x20,0x15,0x8e,0x3b,0x98,0xe1,0x57,0xbc,0x0c,0x2d,0x82,0xd3,0x31,0xb4,0x2a,0x3e,0
x56,0x8e,0xd3,0x35,0x3d,0x9a,0xff,0x16,0x4e,0xb7,0xe7,0x18,0x0d,0x6a,0xcf,0xef,0
现在用scrapy爬一个网站始终遇到521错误,是怎么回事呢?
请教各位,python编写爬虫,返回http error 521怎么解决
找了一个代理网站:http://www.66ip.cn/areaindex_27/1.html