Python构建代理池

使用Python爬取网站内容的时候,容易受到反爬虫机制的限制,所以我们使用IP代理。稳定的IP代理都要花钱,所以这里用免费代理构建自己的代理池。

免费的IP代理
http://www.xicidaili.com

使用的模块

  1. import requests
  2. import threading
  3. import random
  4. from pyquery import PyQuery as pq

爬取的网站

http://www.xicidaili.com

主要分为4个步骤

  1. 获取ip(使用多线程)
  2. 爬取网站上的ip
  3. 检验ip的有效性
  4. 创建多个headers

完整代码

import requests
import threading
import random
from pyquery import PyQuery as pq

# Working proxies collected by findip(), as "ip:port" strings.
IP_list = []
Target_url = 'https://ip.cn'     # URL used to verify that a proxy actually works

def getheaders():
    """Build a request-headers dict carrying a randomly picked User-Agent.

    :return: dict with a single 'User-Agent' key, chosen at random from a
             fixed pool of desktop-browser UA strings (to vary the client
             fingerprint between requests).
    """
    agent_pool = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    return {'User-Agent': random.choice(agent_pool)}


def findip(type, pagenum):
    """Scrape one listing page of free proxies and keep the ones that work.

    :param type: proxy category 1-4 (see url_map); int or str accepted
    :param pagenum: 1-based page number of the listing
    :return: None; each verified "ip:port" is printed and appended to the
             module-level IP_list
    """
    # Keys are strings so callers may pass either int or str for `type`.
    url_map = {
        '1': 'http://www.xicidaili.com/nt/',  # domestic regular proxies
        '2': 'http://www.xicidaili.com/nn/',  # domestic high-anonymity proxies
        '3': 'http://www.xicidaili.com/wn/',  # domestic https proxies
        '4': 'http://www.xicidaili.com/wt/'   # foreign http proxies
    }
    url = url_map[str(type)] + str(pagenum)  # build the page URL
    headers = getheaders()                   # random User-Agent per request
    html = requests.get(url=url, headers=headers, timeout=5).text
    doc = pq(html)
    # Renamed from `all`/`list` in the original to stop shadowing builtins.
    rows = doc('#ip_list tr')
    for row in rows.items():
        # NOTE(review): assumes the table cell text is space-separated with
        # the IP at index 1 and the port at index 2 — confirm against the
        # live page markup.
        fields = row.find('td').text().split(' ')
        # BUG FIX: the original checked `> 1` but indexed fields[2],
        # raising IndexError on rows with exactly two fields.
        if len(fields) > 2:
            candidate = fields[1] + ':' + fields[2]
            if checkip(candidate):  # keep only proxies that answer
                print(candidate)
                IP_list.append(candidate)

def getip():
    """Collect proxies concurrently: 4 categories x first 3 pages each.

    Spawns one thread per (category, page) pair running findip(), starts
    them all, then waits for every one to finish so that IP_list is fully
    populated when this function returns.
    """
    threads = []
    for type in range(4):  # four proxy categories, first three pages each
        for pagenum in range(3):
            t = threading.Thread(target=findip, args=(type + 1, pagenum + 1))
            threads.append(t)
    print('开始爬取代理ip')
    for t in threads:   # start all workers first so pages download in parallel
        t.start()
    for t in threads:
        # BUG FIX: the original loop did `for e in threads: s.join()`,
        # joining only the last-started thread repeatedly — the function
        # could return (and print completion) while workers were still running.
        t.join()
    print('爬取完成')


def checkip(ip):
    """Check whether a proxy actually works by fetching Target_url through it.

    :param ip: proxy address as an "ip:port" string
    :return: True if the proxied GET returns HTTP 200 within 1s, else False
    """
    headers = getheaders()  # random User-Agent so the probe looks organic
    # Free proxies here speak plain HTTP, so both schemes route through
    # the same http:// endpoint.
    proxies = {'http': f'http://{ip}', 'https': f'http://{ip}'}
    try:
        status = requests.get(url=Target_url, proxies=proxies,
                              headers=headers, timeout=1).status_code
    # BUG FIX: narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit and made the script impossible to stop.
    except requests.RequestException:
        return False
    return status == 200


if __name__ == '__main__':
    # Harvest proxies, then persist the verified pool one "ip:port" per line.
    getip()
    with open('result.txt', 'w') as f:
        f.writelines(ip + '\n' for ip in IP_list)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值