import requests
import time
from bs4 import BeautifulSoup
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64'
}
ip_dict = {}

# Fetch a URL and return the parsed BeautifulSoup object, or None on failure.
def get_source(url):
    r = requests.get(url, headers=header)
    if r.status_code == 200:
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'html.parser')
        return soup
    else:
        print('Failed to fetch the page')
        return None
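# Optional hardening (a sketch, not part of the original script): a Session
# mounted with a urllib3 Retry policy retries transient failures with backoff
# instead of giving up on the first error. get_source_with_retry is a
# hypothetical drop-in for get_source above.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def get_source_with_retry(url, retries=3):
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=0.5)))
    r = session.get(url, headers=header, timeout=10)
    if r.status_code == 200:
        r.encoding = 'utf-8'
        return BeautifulSoup(r.text, 'html.parser')
    return None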
def get_url(URL):  # fetch one listing page when paginating
    soup = get_source(URL)
    return soup
# Parse one listing page and record each proxy in ip_dict,
# keyed by IP with values like 'HTTP://1.2.3.4:8080'.
def get_info(soup):
    items = soup.select('#list > table > tbody > tr')
    for item in items:
        IP = item.find('td', attrs={'data-title': 'IP'}).string
        PORT = item.find('td', attrs={'data-title': 'PORT'}).string
        TYPE = item.find('td', attrs={'data-title': '类型'}).string  # the 类型 (type) column
        ip_dict[IP] = TYPE + '://' + IP + ':' + PORT
# Validate each proxy by fetching Baidu through it; drop any proxy that
# errors out or takes longer than 0.1 s to respond.
def check_ip():
    for IP in list(ip_dict.keys()):
        try:
            proxy = ip_dict[IP].lower()  # requests expects a lowercase scheme
            # Both keys must be set, otherwise an https request bypasses the proxy.
            r = requests.get('https://www.baidu.com/',
                             proxies={'http': proxy, 'https': proxy},
                             headers=header, timeout=0.1)
            r.raise_for_status()  # raise on any non-2xx status code
        except requests.RequestException:
            del ip_dict[IP]
    print('All IPs checked; {} responded within 0.1 s.'.format(len(ip_dict)))
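# Optional variant (a sketch, not part of the original script): the sequential
# loop above blocks on every timeout, which is slow for large lists. A thread
# pool runs the same probe concurrently; check_ip_concurrent is a hypothetical
# helper with the same effect on ip_dict.
from concurrent.futures import ThreadPoolExecutor

def _probe(IP):
    try:
        proxy = ip_dict[IP].lower()
        r = requests.get('https://www.baidu.com/',
                         proxies={'http': proxy, 'https': proxy},
                         headers=header, timeout=0.1)
        r.raise_for_status()
        return IP, True
    except requests.RequestException:
        return IP, False

def check_ip_concurrent(workers=20):
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for IP, ok in pool.map(_probe, list(ip_dict.keys())):
            if not ok:
                del ip_dict[IP]
    print('All IPs checked; {} responded within 0.1 s.'.format(len(ip_dict)))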
# Append the surviving proxies to ip.txt, one per line. Mode 'a' creates the
# file if it does not exist and appends after any existing content; other
# modes ('w+', 'w', 'wb', ...) could be substituted.
def save_ip():
    with open('ip.txt', 'a') as f:
        for IP in list(ip_dict.keys()):
            print(ip_dict[IP])
            f.write(ip_dict[IP])
            f.write('\n')  # newline, so each proxy sits on its own line
if __name__ == '__main__':
    url = 'https://www.kuaidaili.com/free/'
    for page_num in range(1, 2):  # widen the range to crawl more pages
        URL = url + 'inha/{}/'.format(page_num)
        print(URL)
        time.sleep(1)  # pause between pages to avoid hammering the site
        soup = get_url(URL)
        if soup is not None:  # skip pages that failed to download
            get_info(soup)
    check_ip()
    save_ip()
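Once the script has run, ip.txt holds one proxy per line. A minimal consumption sketch (a separate snippet, not part of the original post; httpbin.org/ip is just an example echo endpoint):

import requests

# Read the proxies saved by save_ip(), one per line.
with open('ip.txt') as f:
    proxies = [line.strip() for line in f if line.strip()]

if proxies:
    proxy = proxies[0].lower()  # requests expects a lowercase scheme
    # Route a request through the proxy; the response echoes the exit IP.
    r = requests.get('https://httpbin.org/ip',
                     proxies={'http': proxy, 'https': proxy},
                     timeout=5)
    print(r.text)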