# Batch-scrape free proxy IPs.
# Target URL: https://www.freeip.top/?page=1
# Tools: Chrome, PyCharm, Python 3.8
# Note: this script does not verify that a proxy is alive. To check, issue a GET
# against a large site (e.g. Baidu) through the proxy and treat HTTP 200 as usable.
import requests
import re
import time
import random
from bs4 import BeautifulSoup
# Scrape proxy IPs and ports from https://www.freeip.top/?page=1
# Output file where scraped "ip:port" lines are saved, one per line.
txtPath ="AgencyIP.txt"
def getHTML(url):
    """Fetch *url* and return its decoded body text, or "" on any request failure.

    Sleeps 5 seconds before each request as a crude rate limit so the
    proxy site does not block the scraper.
    """
    time.sleep(5)  # outside the try: it cannot raise a request error
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding  # decode with the detected charset
        return r.text
    except requests.RequestException as exc:
        # The original bare `except` printed r.status_code, which raises
        # NameError when requests.get() itself fails (r never bound).
        # Report the actual error and fall back to an empty string.
        print(exc)
        return ""
# Re-scrape the proxy list and save it to disk.
def RestGetIP():
    """Scrape pages 1-9 of freeip.top and rewrite ``txtPath`` with "ip:port" lines.

    Pages that fail to download or contain no table are skipped instead of
    crashing the whole run.
    """
    url = 'https://www.freeip.top/?page='
    # Truncate the output file once up front; rows are appended per page below.
    with open(txtPath, "w") as file:
        file.write("")
    # Hoisted: compile each pattern once, not once per page.
    ip_reg = re.compile('ip="(.*?)"')
    # \d+ (not \d*) so an empty <td></td> cannot contribute a '' port and
    # shift the ip/port pairing.
    port_reg = re.compile(r'<td>(\d+)</td>')
    for page in range(1, 10):
        nowUrl = url + str(page)
        print(nowUrl)
        html = getHTML(nowUrl)
        soup = BeautifulSoup(html, 'lxml')
        body = soup.find('body')
        table = body.find('table') if body else None
        if table is None:
            # Download failed or page layout changed — skip rather than
            # raise AttributeError on None.
            continue
        table_text = str(table)
        ip_arry = ip_reg.findall(table_text)       # list of IPs
        port_array = port_reg.findall(table_text)  # matching port numbers
        print(ip_arry)
        # zip() stops at the shorter list, preventing the original
        # port_array[i] IndexError; open the file once per page, not per row.
        with open(txtPath, "a") as ipfile:
            for ip, port in zip(ip_arry, port_array):
                s = ip + ':' + port + "\n"
                print(s)
                ipfile.write(s)
# Read the saved proxy file back into memory.
def redDataTolist(path=None):
    """Read proxies (one "ip:port" per line) from *path* (default: ``txtPath``).

    Returns a list of dicts shaped for requests' ``proxies=`` argument:
    ``[{'http': 'ip:port'}, ...]``. Blank lines are skipped.
    """
    if path is None:
        path = txtPath  # module-level default written by RestGetIP()
    data = []
    # `with` guarantees the handle is closed even if reading raises.
    with open(path, "r") as f:
        for line in f:
            # rstrip("\n") instead of line[:-1]: the old slice chopped a real
            # character off the last line when the file lacked a trailing newline.
            line = line.rstrip("\n")
            if line:  # skip blanks instead of emitting a useless {'http': ''}
                data.append({'http': line})
    return data
if __name__ == '__main__':
    # Refresh the on-disk proxy list, then load it back as requests-style dicts.
    RestGetIP()
    proxies = redDataTolist()
    print(proxies)