建立代理IP池

爬取高匿代理IP并测试可用性,保存为.csv格式

import requests
from bs4 import BeautifulSoup
import lxml
import time


def get_IP(num):
    """Scrape high-anonymity proxy addresses from kuaidaili's free list.

    Args:
        num: number of listing pages to crawl (pages 1 through num).

    Returns:
        list[str]: scraped proxies formatted as 'ip:port'.
    """
    IP_list = []
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    for i in range(1, num+1):
        print(i)
        url = f'https://www.kuaidaili.com/free/inha/{i}/'
        time.sleep(1)  # be polite: throttle page fetches
        # timeout keeps a dead connection from hanging the crawl forever
        req = requests.get(url, headers=header, timeout=10)
        soup = BeautifulSoup(req.text, 'lxml')
        tr_list = soup.find_all('tr')[1:]  # skip the table header row

        for tr in tr_list:
            td_list = tr.find_all('td')
            if len(td_list) < 2:
                # defensive: skip malformed rows instead of raising IndexError
                continue
            ip = td_list[0].text
            port = td_list[1].text
            IP_list.append(f'{ip}:{port}')
    return IP_list


def test_IP(IP_list):
    '''
    检验IP可用性
    ''' 
    IP_pool = []
    url = 'http://www.httpbin.org/ip'
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    for ip in IP_list:
        try:
            pro = {'http': f'http://{ip}', 
                'https': f'http://{ip}'}
            req = requests.get(url, headers=header, proxies=pro, timeout=5)
            time.sleep(0.5)
        
            if req.status_code == 200 and req.json()['origin']==ip:
                IP_pool.append(ip)
        except:
            continue
    return IP_pool


if __name__ == '__main__':
    import csv

    # Crawl 3 listing pages, then keep only the proxies that respond.
    IP_list = get_IP(3)
    IP_pool = test_IP(IP_list)

    # Create the csv.writer once instead of constructing a new writer
    # object for every row; pin the encoding explicitly.
    with open('proxy_ip.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['ip'])  # header row
        for ip in IP_pool:
            writer.writerow([ip])  # writerow() takes an iterable of fields

    print('IP池建立完毕')

测试代理IP是否成功

import pandas as pd
import numpy as np
import requests
import json


url = 'http://www.httpbin.org/get'
# First column of the CSV produced by the pool-builder script holds the proxies.
ips = pd.read_csv('proxy_ip.csv').iloc[:, 0]

for ip in ips:
    pro = {'http': f'http://{ip}',
           'https': f'http://{ip}'}
    try:
        r = requests.get(url, proxies=pro, timeout=10).text
        r_dict = json.loads(r)
        print('代理IP:', pro)
        print('实际IP:', r_dict['origin'])
    except (requests.RequestException, ValueError, KeyError):
        # Unreachable proxy, non-JSON body, or missing 'origin' field:
        # skip it. A bare except would also hide Ctrl-C and real bugs.
        continue

(此处原文插入运行结果截图)

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值