利用爬虫构建IP代理池

最新推荐文章于 2024-07-19 17:57:00 发布

yubaibusan

最新推荐文章于 2024-07-19 17:57:00 发布

阅读量474

点赞数

分类专栏：爬虫学习　文章标签：IP代理池

本文链接：https://blog.csdn.net/qq_42563386/article/details/100905061

版权

爬虫学习专栏收录该内容

2 篇文章 0 订阅

订阅专栏

本文介绍了如何利用爬虫来构建一个IP代理池，首先进行必要的准备工作，包括安装所需的包。虽然免费的代理IP稳定性不佳，仅适合偶尔测试，但通过爬虫获取和维护IP代理池可以提供稳定的代理服务。

摘要由CSDN通过智能技术生成

准备工作：先安装好我们要用到的包。

# Driver used to connect to MongoDB; the MongoDB server itself must be installed beforehand
pip install pymongo
# Library used to send HTTP requests
pip install requests
# Library used to parse HTML documents and extract the content we want
pip install lxml

源代码(注:免费的代理ip通常不够稳定,偶尔测试使用一下还可以,但常用代理ip还是付费的比较稳定):

import threading
import pymongo
import requests
from lxml import html

# First page of the proxy listing to scrape.
url = 'https://www.xicidaili.com/nn/1'

# Send a browser-like User-Agent with every request made by this script
# (ip_test reuses this dict).
headers = {
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
}
# Alias lxml's etree so this script (including ip_test) can call etree.HTML().
etree = html.etree

# Bound the request: without a timeout, requests waits indefinitely on a
# server that accepts the connection but never answers.
response = requests.get(url=url, headers=headers, timeout=10)

# Parse into its own name so the imported lxml ``html`` module is not shadowed.
page_tree = etree.HTML(response.text)
# Each list below holds one table column, taken from the <tr class="odd"> rows.
# IP address (2nd cell)
ip = page_tree.xpath("//tr[@class='odd']/td[2]/text()")
# Port number (3rd cell)
port = page_tree.xpath("//tr[@class='odd']/td[3]/text()")
# Anonymity level (5th cell)
anon = page_tree.xpath("//tr[@class='odd']/td[5]/text()")
# Protocol, HTTP or HTTPS (6th cell)
prot = page_tree.xpath("//tr[@class='odd']/td[6]/text()")


# High-anonymity proxies whose listed protocol is HTTP, as "ip:port" strings
http_list = []
# All other high-anonymity proxies (HTTPS and anything else), as "ip:port" strings
https_list = []
# Proxies that passed ip_test, stored as {'ip': 'ip:port'} documents
goodip_list = []


# zip() walks the four scraped columns in lockstep and stops at the shortest
# one, so a column with fewer entries cannot raise IndexError.
# NOTE(review): an empty table cell would still shift a column out of
# alignment with the others; confirm the page never has empty cells.
for address, port_number, anonymity, protocol in zip(ip, port, anon, prot):
    # Keep only proxies listed as high-anonymity ('高匿').
    if anonymity.strip() != '高匿':
        continue
    ip_port = address.strip() + ':' + port_number.strip()
    # Compare case-insensitively so "HTTP" and "http" are both recognised.
    if protocol.strip().lower() == 'http':
        http_list.append(ip_port)
    else:
        https_list.append(ip_port)


def ip_test(proxy):
    """Check whether *proxy* is usable and hides the caller's own address.

    Fetches an IP-echo page through the proxy and compares the address that
    page reports with the proxy's own host.

    Args:
        proxy: Proxy endpoint as an ``"ip:port"`` string.

    Returns:
        True if the request completes within the timeout and the reported
        address equals the proxy's host part; False on any failure.
    """
    urls = r'http://ip.tool.chinaz.com/'

    try:
        # Split on the last colon rather than assuming the port is 4 or 5
        # digits long, so ports such as 80 or 808 are handled as well.
        proxy_host = proxy.rsplit(':', 1)[0]

        result = requests.get(
            url=urls,
            headers=headers,
            proxies={
                'http': f'http://{proxy}',
                'https': f'https://{proxy}',
            },
            timeout=5,
        )
        # Parse the returned page
        htmls = etree.HTML(result.text)
        # xpath() returns a list of matching text nodes
        ip_get = htmls.xpath(r'//dd[@class="fz24"]/text()')
    except Exception:
        # Best-effort probe: any failure (connection error, timeout, page
        # that cannot be parsed) means the proxy is unusable. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        return False

    if not ip_get:
        # The page did not contain the expected element.
        return False
    return ip_get[0].strip() == proxy_host


def save_ip(proxy, proxy_type):
    """Validate *proxy* and, if it works, record it in ``goodip_list``.

    Args:
        proxy: Proxy endpoint as an ``"ip:port"`` string.
        proxy_type: Protocol label; only used in the printed message.
    """
    usable = ip_test(proxy)
    if not usable:
        return
    print(f'可使用ip:{proxy}, 类型为:{proxy_type}')
    document = {'ip': proxy}
    goodip_list.append(document)


if __name__ == '__main__':
    # Validate every candidate concurrently: one thread per proxy, each
    # running save_ip with the protocol label of the list it came from.
    tasks = []
    for proxy_type, candidates in (('http', http_list), ('https', https_list)):
        for candidate in candidates:
            task = threading.Thread(target=save_ip, args=(candidate, proxy_type))
            tasks.append(task)
            task.start()
    # Wait for all probes to finish before touching the database.
    for task in tasks:
        task.join()

    # Connect to MongoDB using the client's default connection settings.
    conn = pymongo.MongoClient()
    try:
        # Collection "proxys" in database "proxy".
        proxys = conn.proxy.proxys
        # Replace the previous pool. delete_many on a collection that does
        # not exist yet removes nothing, so no existence check is needed.
        proxys.delete_many({})
        # insert_many rejects an empty list, so skip it when no proxy passed
        # validation.
        if goodip_list:
            proxys.insert_many(goodip_list)
    finally:
        # Always release the connection, even if a database call failed.
        conn.close()

    print('已完成所有操作!')