python 爬取kuaidaili ops-提取数据

最新推荐文章于 2022-06-23 12:49:36 发布

清风冷吟

最新推荐文章于 2022-06-23 12:49:36 发布

阅读量602

点赞数

分类专栏： Python相关　文章标签： python3 快代理提取代理地址

本文链接：https://blog.csdn.net/weixin_43968923/article/details/86373781

版权

Python相关专栏收录该内容

30 篇文章 0 订阅

订阅专栏

python 爬取kuaidaili ops页-提取数据

爬取快代理，开放代理页

爬取快代理，开放代理页

提取相关的数据
导入excel表（后期直接写入数据库）

import os
import re
import time
import openpyxl
import requests
from bs4 import BeautifulSoup


class KuaiDaiLi:
    """Scrape the open-proxy ("ops") listing of kuaidaili.com into an xlsx file.

    Typical use is ``KuaiDaiLi().main()``: it visits the home page, opens or
    creates the workbook, discovers the listing pages from the navigation bar
    and appends one row per proxy found on each page.

    Attributes:
        session: ``requests`` session reused for every request.
        excel: the open ``openpyxl`` workbook, or ``None`` before
            ``initialize_excel`` has run.
        headers: HTTP headers sent with every request.
        count: maximum number of listing pages to crawl (default 3);
            a falsy or non-positive value means "no limit".
        time: pause in seconds between two listing pages.
    """

    # Column titles of the header row, first entry being the IP column.
    # NOTE: "响应速度\t" carries the same trailing tab as the original
    # header literal so newly created files match existing ones.
    HEADLINE = ("IP", "PORT", "匿名度", "类型", "get/post支持", "位置", "响应速度\t", "最后验证时间")

    def __init__(self):
        self.session = requests.Session()
        self.excel = None
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.count = 3  # page depth to crawl; 3 pages by default
        self.time = 2  # seconds to wait before fetching the next page

    def get_status(self, url, timeout=10):
        """Fetch *url* and return the response only when it succeeded.

        Args:
            url: address to GET through the shared session.
            timeout: seconds to wait for the server before giving up.

        Returns:
            The response object when the status code is 200, otherwise
            ``None`` (a failure message is printed in that case).
        """
        try:
            response = self.session.get(url, headers=self.headers, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts are reported the same way as a
            # non-200 answer so callers only have to handle ``None``.
            print("网络连接失败！", exc)
            return None
        if response.status_code != 200:
            print("网络连接失败！")
            return None
        return response

    def index(self, url):
        """Visit *url* (the home page) once; return True when it answered 200."""
        return self.get_status(url) is not None

    def initialize_excel(self, path):
        """Open the workbook at *path*, or create it with a header row.

        An existing file is loaded unchanged. Otherwise a new workbook is
        created and the header row is written and saved immediately.
        """
        if os.path.exists(path):
            self.excel = openpyxl.load_workbook(path)
            return
        self.excel = openpyxl.Workbook()
        # Same {first column: remaining columns} shape as the scraped rows.
        headline_data = {self.HEADLINE[0]: list(self.HEADLINE[1:])}
        self.write_to_excel(path, headline_data)

    def write_to_excel(self, path, info_dic):
        """Append every entry of *info_dic* to the first sheet and save.

        Args:
            path: file the workbook is saved to after appending.
            info_dic: mapping ``ip -> [port, anonymity, type, get/post
                support, location, speed, last verification time]``.
        """
        sheet = self.excel[self.excel.sheetnames[0]]
        for ip, fields in info_dic.items():
            # One row per proxy: the IP followed by its seven detail columns.
            sheet.append((ip, *fields[:7]))
        self.excel.save(path)

    def page_parse(self, url):
        """Yield the absolute URLs of the listing pages linked from *url*.

        The links are read from the ``#listnav`` navigation bar. At most
        ``self.count`` URLs are produced; nothing is yielded when the page
        cannot be fetched.
        """
        from itertools import islice
        from urllib.parse import urljoin

        response = self.get_status(url)
        if response is None:
            return
        soup = BeautifulSoup(response.text, "html5lib")
        hrefs = (anchor.get("href") for anchor in soup.select("#listnav > ul > li > a"))
        # Anchors without an href are skipped; urljoin resolves each link
        # against the page it was found on, whatever the scheme.
        page_urls = (urljoin(url, href) for href in hrefs if href)
        limit = self.count if self.count and self.count > 0 else None
        yield from islice(page_urls, limit)

    def parse(self, url):
        """Extract the proxy table of one listing page.

        Returns:
            ``{ip: [port, anonymity, type, get/post support, location,
            speed, last verification time]}`` for every complete table row,
            or ``None`` when the page could not be fetched.
        """
        print(url)
        response = self.get_status(url)
        if response is None:
            return None
        soup = BeautifulSoup(response.text, "html5lib")
        info_dic = {}
        for tr in soup.select("#freelist tbody > tr"):
            cells = [td.text.strip() for td in tr.find_all("td")]
            # Skip header/malformed rows: eight cells and an IP are required.
            if len(cells) < 8 or not cells[0]:
                continue
            ip, *fields = cells[:8]
            print(ip, *fields)
            info_dic[ip] = fields
        return info_dic

    def main(self):
        """Crawl the open-proxy listing and store the results in the workbook."""
        # Home page first, to establish the session before the listing pages.
        self.index("https://www.kuaidaili.com")

        # Open or create the workbook in the current working directory.
        path = os.path.abspath(os.path.join(os.getcwd(), "代理IP信息.xlsx"))
        self.initialize_excel(path)

        try:
            # Walk the listing pages discovered from the navigation bar.
            for page_url in self.page_parse("https://www.kuaidaili.com/ops/"):
                info_dic = self.parse(page_url)
                # A failed or empty page is skipped instead of aborting the run.
                if info_dic:
                    self.write_to_excel(path, info_dic)
                time.sleep(self.time)  # pause between requests
        finally:
            # Release the workbook even when a page fails mid-crawl.
            if self.excel is not None:
                self.excel.close()


if __name__ == '__main__':
    # Script entry point: build the scraper and run one full crawl.
    KuaiDaiLi().main()

清风冷吟

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python 爬取kuaidaili ops-提取数据

python 爬取kuaidaili ops页-提取数据爬取快代理，开放代理页爬取快代理，开放代理页提取相关的数据导入excel表（后期直接写入数据库）import osimport reimport timeimport openpyxlimport requestsfrom bs4 import BeautifulSoupclass KuaiDaiLi(object...
复制链接

扫一扫