Python crawler: building a free proxy pool and checking proxy availability

0x01 Introduction

This post walks through scraping free proxy IPs and checking whether they still work. BeautifulSoup is used to parse each listing page and pull the IP address and port out of every table row; all sources are public, free proxy-pool sites.
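
The scrapers below all rely on requests and beautifulsoup4 (installable with `pip install requests beautifulsoup4`), and they all follow the same pattern: download a listing page, locate the HTML table, and read an "ip:port" string out of each row. Here is a minimal sketch of that shared pattern; get_proxies, its table_index parameter, and the placeholder URL are illustrative only, not part of the script below:

import requests
from bs4 import BeautifulSoup

def get_proxies(url, table_index=0):
    # Download the listing page; a timeout keeps dead sites from hanging us.
    res = requests.get(url, timeout=10)
    proxies = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        # Pick the table that holds the proxy list, then walk its rows.
        table = soup.find_all("table")[table_index]
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 2:  # skip header rows and malformed rows
                ip = cells[0].get_text(strip=True)
                port = cells[1].get_text(strip=True)
                proxies.append(ip + ":" + port)
    return proxies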

0x02 Code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/21 22:09
# @Author : H
# @File : getproxy.py

import requests
from bs4 import BeautifulSoup


def getProxyIP_61(page):
    # Scrape one listing page of www.66ip.cn; the proxy list is the
    # third <table> on the page, and its header row starts with "ip".
    base_url = f"http://www.66ip.cn/{page}.html"
    res = requests.get(base_url, timeout=10)
    sub = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        table = soup.find_all("table")[2]
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 2:
                ip = cells[0].get_text(strip=True)
                port = cells[1].get_text(strip=True)
                if ip != "ip":  # skip the header row
                    sub.append(ip + ":" + port)
    return sub


def getProxyIP_61_areaindex_1(page):
    # Same parsing logic as above, but for the area-specific
    # (areaindex_1) listing pages of www.66ip.cn.
    base_url = f"http://www.66ip.cn/areaindex_1/{page}.html"
    res = requests.get(base_url, timeout=10)
    sub = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        table = soup.find_all("table")[2]
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 2:
                ip = cells[0].get_text(strip=True)
                port = cells[1].get_text(strip=True)
                if ip != "ip":  # skip the header row
                    sub.append(ip + ":" + port)
    return sub


def getProxyIP_xicaidaili(page):
    # Despite the name, this scrapes ip.jiangxianli.com; rows without
    # <td> cells (the header) are skipped.
    base_url = f"https://ip.jiangxianli.com/?page={page}"
    res = requests.get(base_url, timeout=10)
    sub = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        table = soup("table")[0]
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 2:
                sub.append(cells[0].get_text(strip=True) + ":" + cells[1].get_text(strip=True))
    return sub


def getIPproxy_ihuan(page):
    # ip.ihuan.me paginates with opaque tokens rather than numbers:
    #   page 1 -> b97827cc, page 2 -> 4ce63706,
    #   page 3 -> 5crfe930, page 4 -> f3k1d581
    base_url = f"https://ip.ihuan.me/?page={page}"
    res = requests.get(base_url, timeout=10)
    sub = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        tbody = soup.find_all("tbody")[0]
        for row in tbody.find_all("tr"):
            cells = row.find_all("td")
            # The IP sits inside an <a> tag; the port is plain text.
            ip = cells[0].a.get_text(strip=True)
            port = cells[1].get_text(strip=True)
            sub.append(ip + ":" + port)
    return sub


def huizong(sub, res):
    # "huizong" (汇总) = aggregate: merge one scraper's results into the pool.
    sub.extend(res)
    return sub


if __name__ == '__main__':
    # Collect proxy IP:port pairs from every source.
    pages = ['b97827cc', '4ce63706', '5crfe930']
    sub = []
    for page in pages:
        huizong(sub, getIPproxy_ihuan(page))
    for i in range(1, 3):
        huizong(sub, getProxyIP_61(i))
        huizong(sub, getProxyIP_61_areaindex_1(i))
        huizong(sub, getProxyIP_xicaidaili(i))


    # Check each proxy by fetching a known-good URL through it. The proxy
    # must be mapped for both http and https, otherwise requests would
    # fetch the https test URL directly and every proxy would look alive.
    url = "https://www.baidu.com"
    for ip in sub[:]:  # iterate over a copy so removal below is safe
        try:
            proxy = "http://" + ip
            proxies = {"http": proxy, "https": proxy}
            res = requests.get(url, proxies=proxies, timeout=5)
            if res.ok:
                print("working proxy:\t" + proxy)
            else:
                sub.remove(ip)
        except Exception:
            sub.remove(ip)
            continue
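
After the loop finishes, sub holds only the proxies that answered. A short sketch of actually routing a request through one of them; the httpbin.org test URL and the random choice are illustrative, not part of the original script:

import random
import requests

# Pick one surviving proxy at random (assumes the check left at least one).
proxy = "http://" + random.choice(sub)
proxies = {"http": proxy, "https": proxy}

# httpbin.org/ip echoes the caller's address, confirming the proxy is used.
res = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=5)
print(res.text)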

0x03 Results

[Screenshot: console output listing the working proxies]
