Quickly Build a Free IP Proxy Pool

The proxy source used in this article:

云代理: http://www.ip3366.net/?stype=1&page=1

IP verification site: http://httpbin.org/ip
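
httpbin.org/ip simply echoes back the IP your request appears to come from, which is what makes it suitable for checking whether a proxy is actually in use. A quick way to see the response shape (the IP below is just a documentation example):

import requests

# httpbin echoes the IP it saw the request come from
print(requests.get('http://httpbin.org/ip', timeout=6).json())
# -> {'origin': '203.0.113.7'}  (your public IP, or the proxy's exit IP)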

Fetch the page with a GET request, making sure the request headers and Cookie are set:

        self.faker = Faker(locale='zh_CN')
        self.header = {
            'User-Agent': self.faker.chrome(),  # random Chrome UA generated by Faker
            'Cookie': 'Hm_lvt_c4dd741ab3585e047d56cf99ebbbe102=1667027823,1668068804; Hm_lpvt_c4dd741ab3585e047d56cf99ebbbe102=1668068825',
            'Host': 'www.ip3366.net',
        }

The UA here is forged with faker. That is nothing special; it just makes the server treat the request as coming from a regular browser user.
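
For reference, a standalone sketch of what Faker's user_agent provider generates (not part of the original script):

from faker import Faker

faker = Faker(locale='zh_CN')
# chrome() comes from Faker's user_agent provider and returns a
# freshly randomized Chrome User-Agent string on every call
for _ in range(3):
    print(faker.chrome())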

After the request succeeds, use XPath to extract the IP addresses and port numbers.

Store each extracted IP address and port number in a list:

        num = int(input('Enter the number of pages to scrape: '))
        self.ip_list = []
        for page in range(num):
            print(f"============================== scraping page {page+1} ==============================")
            self.url = f'http://www.ip3366.net/?stype=1&page={page+1}'
            reqs = requests.get(self.url, headers=self.header)
            reqs.encoding = 'gb2312'  # the site is served as gb2312
            selectors = Selector(reqs.text)
            tr_lists = selectors.xpath('//div[@id="container"]/div[@id="list"]/table[@class="table table-bordered table-striped"]/tbody/tr')
            for tr_list in tr_lists:
                item = {}
                item['ip'] = tr_list.xpath('./td[1]/text()').get()    # IP address column
                item['port'] = tr_list.xpath('./td[2]/text()').get()  # port column
                # item['type'] = tr_list.xpath('./td[4]/text()').get()  # anonymity type, unused
                proxies = {
                    'http': item['ip'] + ':' + item['port'],
                    'https': item['ip'] + ':' + item['port']
                }
                print(proxies)
                self.ip_list.append(proxies)

Validate the IPs with a 6-second timeout; any proxy that times out (or whose exit IP does not match) is marked unusable, leaving only the usable ones:

        can_ip = []
        cant_ip = []
        for ip in self.ip_list:
            try:
                req = requests.get(url='http://httpbin.org/ip', headers=self.headers, proxies=ip, timeout=6)
                req_json = req.json()
                req_ip = req_json.get('origin')  # the exit IP httpbin saw
                print('origin:', req_ip)
                http_ip_prot = ip.get('http')
                # keep only the IP part of "ip:port"
                http_ip = re.search(r'(.*?):(\d+)', http_ip_prot, re.S).group(1)
                print('http_ip:', http_ip)
                # usable only if the request succeeded AND the exit IP is the proxy's own IP
                if req.status_code == 200 and http_ip == req_ip:
                    can_ip.append(ip)
                    print(ip, 'usable')
                else:
                    cant_ip.append(ip)
                    print(ip, 'unusable')
            except Exception:
                cant_ip.append(ip)
                print(ip, 'unusable')

Save the usable IPs to a CSV file:

        with open('IP_use.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            # one proxy per row, rather than the whole list crammed into a single row
            writer.writerows([ip['http']] for ip in can_ip)
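
To actually draw from the pool later, something like the following would work (a sketch, not part of the original script; it assumes IP_use.csv was written one proxy per row as above):

import csv
import random

import requests

# load the pool back from disk
with open('IP_use.csv', encoding='utf-8') as f:
    pool = [row[0] for row in csv.reader(f) if row]

# pick a random proxy for each outgoing request
proxy = random.choice(pool)
resp = requests.get('http://httpbin.org/ip',
                    proxies={'http': proxy, 'https': proxy},
                    timeout=6)
print(resp.json())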

The full code:

import requests
# import time
from faker import Faker
from parsel import Selector
import csv
import re




class IPDL:
    def __init__(self):
        self.faker = Faker(locale='zh_CN')
        self.header = {
            'User-Agent': self.faker.chrome(),  # random Chrome UA generated by Faker
            'Cookie': 'Hm_lvt_c4dd741ab3585e047d56cf99ebbbe102=1674180209; Hm_lpvt_c4dd741ab3585e047d56cf99ebbbe102=1674184681',
            'Host': 'www.ip3366.net',
        }


    def get_html(self):
        num = int(input('Enter the number of pages to scrape: '))
        self.ip_list = []
        for page in range(num):
            print(f"============================== scraping page {page+1} ==============================")
            self.url = f'http://www.ip3366.net/?stype=1&page={page+1}'
            reqs = requests.get(self.url, headers=self.header)
            reqs.encoding = 'gb2312'  # the site is served as gb2312
            selectors = Selector(reqs.text)
            tr_lists = selectors.xpath('//div[@id="container"]/div[@id="list"]/table[@class="table table-bordered table-striped"]/tbody/tr')
            for tr_list in tr_lists:
                item = {}
                item['ip'] = tr_list.xpath('./td[1]/text()').get()    # IP address column
                item['port'] = tr_list.xpath('./td[2]/text()').get()  # port column
                # item['type'] = tr_list.xpath('./td[4]/text()').get()  # anonymity type, unused
                proxies = {
                    'http': item['ip'] + ':' + item['port'],
                    'https': item['ip'] + ':' + item['port']
                }
                print(proxies)
                self.ip_list.append(proxies)

    def train_ip(self):
        """检测ip的函数"""
        self.headers = {
            'User-Agent': self.faker.chrome(),
        }
        can_ip = []
        cant_ip = []
        for ip in self.ip_list:
            try:
                req = requests.get(url='http://httpbin.org/ip', headers=self.headers, proxies=ip, timeout=6)
                # print(req.json())
                req_json = req.json()
                req_ip = req_json.get('origin')
                print('这是origin: ',req_ip)
                http_ip_prot = ip.get('http')
                http_ip = re.search(r'(.*?):(\d+)', http_ip_prot, re.S).group(1)
                print('这是http_ip: ', http_ip)
                try:
                    if req.status_code == 200 and http_ip==req_ip:
                        can_ip.append(ip)
                        print(ip, '可用')
                        print(req.json())
                    else:
                        cant_ip.append(ip)
                        print(ip, '不可用')
                except:
                    cant_ip.append(ip)
                    print(ip, '不可用')
            except:
                cant_ip.append(ip)
                print(ip, '不可用')


        with open('IP_use.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            # one proxy per row, rather than the whole list crammed into a single row
            writer.writerows([ip['http']] for ip in can_ip)


        print('usable IPs:', len(can_ip))
        print('unusable IPs:', len(cant_ip))


    def mains(self):
        self.get_html()
        # self.save()
        self.train_ip()

if __name__ == '__main__':
    IPDL().mains()
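
Checking proxies one at a time with a 6-second timeout gets slow as the pool grows. As an optional improvement that is not in the original post, the validation could be parallelized with the standard library's ThreadPoolExecutor; check_one below is a hypothetical helper that mirrors the logic of train_ip:

import re
from concurrent.futures import ThreadPoolExecutor

import requests


def check_one(proxy):
    """Return the proxy dict if its exit IP matches the proxy's own IP, else None."""
    try:
        req = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=6)
        origin = req.json().get('origin')
        ip = re.search(r'(.*?):(\d+)', proxy['http']).group(1)
        if req.status_code == 200 and ip == origin:
            return proxy
    except Exception:
        return None
    return None


def check_all(ip_list, workers=20):
    # run up to `workers` checks concurrently and keep only the survivors
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return [p for p in pool.map(check_one, ip_list) if p]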
