Python 爬虫:构建自己的代理 IP 池

查看源码 使用xpath解析标签
在这里插入图片描述

import requests
import parsel

# Scrape the first page of kuaidaili's free proxy list and collect every row
# as a single-item dict of the form {type: "ip:port"} in `proxies_list`.
proxies_list = []

url = "https://www.kuaidaili.com/free/"
hander = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=hander, timeout=30)
data = r.text
html_data = parsel.Selector(data)
tr_parse = html_data.xpath(
    '//table[@class="table table-bordered table-striped"]/tbody/tr')
for tr in tr_parse:
    http_type = tr.xpath('./td[4]/text()').extract_first()
    ip = tr.xpath('./td[1]/text()').extract_first()
    ip_port = tr.xpath('./td[2]/text()').extract_first()
    # extract_first() yields None when a cell has no text node; skip such
    # rows instead of failing on the string concatenation below.
    if None in (http_type, ip, ip_port):
        continue
    proxies_list.append({http_type: ip + ':' + ip_port})

print(proxies_list)


def check_ip(proxies_list, test_url='http://www.baidu.com', timeout=0.1):
    """Return the proxies in ``proxies_list`` that can fetch ``test_url``.

    Args:
        proxies_list: iterable of single-item dicts mapping a scheme label
            to ``"ip:port"``, e.g. ``{'HTTP': '1.2.3.4:8080'}``.
        test_url: page requested through each proxy.
        timeout: seconds to wait for the request; a proxy that does not
            answer in time is discarded.

    Returns:
        A list with the entries of ``proxies_list`` (unchanged objects) for
        which the request returned HTTP 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    can_use = []
    for ip in proxies_list:
        # requests selects a proxy by the lowercase URL scheme, so the
        # scraped 'HTTP' label has to be normalised before it is passed on.
        proxies = {str(scheme).lower(): address
                   for scheme, address in ip.items()}
        try:
            # Route the probe through the candidate proxy; without
            # `proxies=` only the direct connection would be tested.
            response = requests.get(test_url,
                                    headers=headers,
                                    timeout=timeout,
                                    proxies=proxies)
        except Exception as e:
            # Best-effort probe: any failure just disqualifies this proxy.
            print(ip, e)
            continue
        if response.status_code == 200:
            can_use.append(ip)

    return can_use


print(check_ip(proxies_list))  # output the high-quality IPs
{'HTTP': '125.94.44.129:1080'} HTTPConnectionPool(host='www.baidu.com', port=80): Read timed out. (read timeout=0.1)
[{'HTTP': '60.190.250.120:8080'}, {'HTTP': '118.112.195.91:9999'}, {'HTTP': '110.243.5.163:9999'}, {'HTTP': '118.89.91.108:8888'}, {'HTTP': '125.122.199.13:9000'}, {'HTTP': '171.11.28.248:9999'}, {'HTTP': '211.152.33.24:39406'}, {'HTTP': '59.62.35.130:9000'}, {'HTTP': '123.163.96.124:9999'}, {'HTTP': '125.117.135.10:9000'}, {'HTTP': '175.44.108.164:9999'}, {'HTTP': '110.243.15.228:9999'}, {'HTTP': '1.193.245.47:9999'}, {'HTTP': '59.62.24.87:9000'}]

使用代理 IP 池来访问:

 # Crawl listing pages 1-4 through the previously collected proxies and
# gather every proxy row found into `proxies_list`.
proxies_list = []
proxy = [
    {'HTTP': '60.190.250.120:8080'},
    {'HTTP': '118.112.195.91:9999'},
    {'HTTP': '110.243.5.163:9999'},
    {'HTTP': '118.89.91.108:8888'},
    {'HTTP': '125.122.199.13:9000'},
    {'HTTP': '171.11.28.248:9999'},
    {'HTTP': '211.152.33.24:39406'},
    {'HTTP': '59.62.35.130:9000'},
    {'HTTP': '123.163.96.124:9999'},
    {'HTTP': '125.117.135.10:9000'},
    {'HTTP': '175.44.108.164:9999'},
    {'HTTP': '110.243.15.228:9999'},
    {'HTTP': '1.193.245.47:9999'},
    {'HTTP': '59.62.24.87:9000'},
]
hander = {"User-Agent": "Mozilla/5.0"}
for page in range(1, 5):
    url = "https://www.kuaidaili.com/free/inha/" + str(page) + "/"
    for candidate in proxy:
        # requests selects a proxy by the lowercase scheme of the target
        # URL. The target is https, so the 'HTTP' entry is mapped to both
        # schemes; otherwise the request would bypass the proxy entirely.
        address = 'http://' + next(iter(candidate.values()))
        request_proxies = {'http': address, 'https': address}
        try:
            r = requests.get(url, headers=hander, timeout=1,
                             proxies=request_proxies)
        except requests.RequestException as e:
            # A dead or slow proxy must not abort the crawl; try the next.
            print(candidate, e)
            continue
        if r.status_code != 200:
            continue

        html = r.text
        html_parsel_data = parsel.Selector(html)
        tr_parse = html_parsel_data.xpath(
            '//table[@class="table table-bordered table-striped"]/tbody/tr')
        for tr in tr_parse:
            http_type = tr.xpath('./td[4]/text()').extract_first()
            ip = tr.xpath('./td[1]/text()').extract_first()
            ip_port = tr.xpath('./td[2]/text()').extract_first()
            # extract_first() yields None for a cell without text; skip
            # the row rather than fail on the concatenation.
            if None in (http_type, ip, ip_port):
                continue
            proxies_list.append({http_type: ip + ':' + ip_port})

        # The page was fetched successfully; move on to the next page.
        break

def check_ip(proxies_list, test_url='http://www.baidu.com', timeout=0.1):
    """Return the proxies in ``proxies_list`` that can fetch ``test_url``.

    Args:
        proxies_list: iterable of single-item dicts mapping a scheme label
            to ``"ip:port"``, e.g. ``{'HTTP': '1.2.3.4:8080'}``.
        test_url: page requested through each proxy.
        timeout: seconds to wait for the request; a proxy that does not
            answer in time is discarded.

    Returns:
        A list with the entries of ``proxies_list`` (unchanged objects) for
        which the request returned HTTP 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    can_use = []
    for ip in proxies_list:
        # requests selects a proxy by the lowercase URL scheme, so the
        # scraped 'HTTP' label has to be normalised before it is passed on.
        proxies = {str(scheme).lower(): address
                   for scheme, address in ip.items()}
        try:
            # Route the probe through the candidate proxy; without
            # `proxies=` only the direct connection would be tested.
            response = requests.get(test_url,
                                    headers=headers,
                                    timeout=timeout,
                                    proxies=proxies)
        except Exception as e:
            # Best-effort probe: any failure just disqualifies this proxy.
            print(ip, e)
            continue
        if response.status_code == 200:
            can_use.append(ip)
    return can_use
print(check_ip(proxies_list))  # output the high-quality proxy IPs


[{'HTTP': '175.42.128.48:9999'}, {'HTTP': '123.101.212.223:9999'}, {'HTTP': '60.190.250.120:8080'}, {'HTTP': '125.94.44.129:1080'}, {'HTTP': '118.112.195.91:9999'}, {'HTTP': '110.243.5.163:9999'}, {'HTTP': '118.89.91.108:8888'}, {'HTTP': '125.122.199.13:9000'}, {'HTTP': '171.11.28.248:9999'}, {'HTTP': '211.152.33.24:39406'}, {'HTTP': '59.62.35.130:9000'}, {'HTTP': '123.163.96.124:9999'}, {'HTTP': '125.117.135.10:9000'}, {'HTTP': '175.44.108.164:9999'}, {'HTTP': '110.243.15.228:9999'}, {'HTTP': '59.62.24.87:9000'}, {'HTTP': '113.124.93.190:9999'}, {'HTTP': '119.119.239.155:9000'}, {'HTTP': '60.13.42.157:9999'}, {'HTTP': '180.104.63.242:9000'}, {'HTTP': '175.42.68.223:9999'}, {'HTTP': '1.198.73.202:9999'}, {'HTTP': '125.108.76.226:9000'}, {'HTTP': '106.75.177.227:8111'}, {'HTTP': '124.93.201.59:42672'}, {'HTTP': '121.233.206.211:9999'}, {'HTTP': '175.44.109.104:9999'}, {'HTTP': '118.212.104.240:9999'}, {'HTTP': '163.204.240.107:9999'}, {'HTTP': '60.13.42.77:9999'}, {'HTTP': '49.89.86.30:9999'}, {'HTTP': '106.42.217.26:9999'}, {'HTTP': '115.29.170.58:8118'}, {'HTTP': '183.166.133.196:9999'}, {'HTTP': '114.223.208.165:8118'}, {'HTTP': '175.44.109.71:9999'}, {'HTTP': '163.204.244.219:9999'}, {'HTTP': '210.5.10.87:53281'}, {'HTTP': '123.101.213.137:9999'}, {'HTTP': '171.15.49.169:9999'}, {'HTTP': '1.198.72.171:9999'}, {'HTTP': '125.108.101.220:9000'}, {'HTTP': '36.250.156.85:9999'}, {'HTTP': '123.169.167.44:9999'}, {'HTTP': '123.169.167.44:9999'}, {'HTTP': '115.219.168.69:8118'}, {'HTTP': '1.199.30.73:9999'}, {'HTTP': '222.74.65.69:56210'}, {'HTTP': '110.243.26.53:9999'}, {'HTTP': '171.13.7.108:9999'}, {'HTTP': '175.43.151.48:9999'}, {'HTTP': '1.193.245.3:9999'}, {'HTTP': '163.204.240.35:9999'}, {'HTTP': '113.195.16.66:9999'}, {'HTTP': '27.43.188.27:9999'}, {'HTTP': '113.208.115.190:8118'}, {'HTTP': '125.110.100.170:9000'}, {'HTTP': '1.198.72.19:9999'}, {'HTTP': '121.232.199.174:9000'}]

xpath 语法

获取 href 属性和文本
在这里插入图片描述
在这里插入图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值