【python爬虫】爬取ip代理池

最新推荐文章于 2023-11-14 15:01:41 发布

吾有一计

最新推荐文章于 2023-11-14 15:01:41 发布

阅读量1.3k

点赞数

分类专栏： python 文章标签： python 爬虫 tcp/ip

本文链接：https://blog.csdn.net/weixin_52049271/article/details/127175599

版权

python 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

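"""
Goal: scrape a pool of IP proxies with Python.
Required modules: requests, bs4, lxml
Installation:
requests: pip install requests
bs4:  pip install bs4
lxml: pip install lxml  (the parser name passed to BeautifulSoup below)
Development environment: PyCharm, Python 3.8
Essential steps of a crawler:
1. Define the requirement
2. Send the request
3. Receive the data
4. Parse the data
5. Print the data
"""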
import requests
from bs4 import BeautifulSoup
import time
# Module-level accumulator: download() appends one single-entry dict per
# scraped table row, mapping the row's type text to its "ip:port" string.
proxies_list = []

def download(response):
    """Parse one proxy-list page and append its entries to ``proxies_list``.

    Every ``<tr>`` below ``#freelist > table > tbody`` is inspected. Its cells
    are located through their ``data-title`` attribute ("类型", "IP", "PORT")
    and each complete row is stored as a single-entry dict
    ``{type: "ip:port"}``.

    When the type cell holds several comma-separated values, the second one is
    used; otherwise the whole cell text is used. Surrounding whitespace is
    stripped from the type, IP and port.

    Rows that lack any of the three cells are skipped rather than raising
    ``AttributeError``.

    Args:
        response: Object whose ``text`` attribute holds the page HTML (the
            script passes the result of ``requests.get``).

    Returns:
        None. Results are appended to the module-level ``proxies_list``.
    """
    soup = BeautifulSoup(response.text, "lxml")
    # A single selector query walks the parsed tree directly; there is no need
    # to stringify the matched nodes and parse them again.
    for row in soup.select("#freelist > table > tbody tr"):
        type_cell = row.find(attrs={"data-title": "类型"})
        ip_cell = row.find(attrs={"data-title": "IP"})
        port_cell = row.find(attrs={"data-title": "PORT"})
        if type_cell is None or ip_cell is None or port_cell is None:
            # Not a data row (or the markup differs): nothing to record.
            continue

        type_parts = type_cell.text.split(",")
        # Prefer the second listed type when there is one, else the only one.
        chosen_type = type_parts[1] if len(type_parts) > 1 else type_parts[0]
        http_type = chosen_type.strip()

        address = f"{ip_cell.text.strip()}:{port_cell.text.strip()}"
        proxies_list.append({http_type: address})


if __name__ == "__main__":
    # Fetch pages 1-3 of the proxy list and collect their rows.
    for page in range(1, 4):
        url = "https://www.kuaidaili.com/ops/proxylist/" + str(page)
        # Without a timeout requests may wait forever on an unresponsive host.
        response = requests.get(url=url, timeout=10)
        download(response)

    # Print every collected proxy with a 1-based running number, then the
    # total. This section is indented with spaces only: the original mixed a
    # tab into it, which Python 3 rejects with TabError.
    for num, proxies in enumerate(proxies_list, start=1):
        print(f'NO.{num}.{proxies}')
    print(f'爬取ip代理个数{len(proxies_list)}')