Crawling Jenkins information with Python on a schedule and writing it to files; reading the files in Java and storing the data in the database

1. CAS login to Jenkins

# -*- coding: utf-8 -*-
# @Time    : 2023/5/23
# @Author  : Pnny
# @desc    : CAS authentication
import time
import requests
import os
import pickle
from bs4 import BeautifulSoup


class CasService(object):

    def __init__(self):
        self.session = requests.session()  # CAS session

    # This Jenkins instance uses CAS single sign-on
    def login(self, service_url, user_name, pass_word, write_path, fileName):
        # Request the first URL Jenkins redirects to via CAS
        response = self.session.get(url=service_url, allow_redirects=False)
        # print(f"{service_url}:{response.status_code}")
        if response.status_code == 200:
            return True
        # Read the next URL to request from the Location header
        cas_url = response.headers["Location"]
        # print(cas_url)
        # Send the request; on a 403, check that the CAS domain mapping in the server's
        # hosts file is correct (the CAS IP address can be looked up in a browser)
        cas_response = self.session.get(cas_url, allow_redirects=False)
        # print(f"{cas_url}:{cas_response.status_code}")
        if cas_response.status_code == 200:
            # Parse the login page with bs4
            login_html = BeautifulSoup(cas_response.text, 'lxml')
            # Grab the execution value via select(); it is required for authentication
            execution_value = login_html.select('#fm1 > input[name=execution]')[0]['value']
            # Assemble the authentication payload
            auth_data = {
                "_eventId": "submit",
                "execution": execution_value,
                "username": user_name,
                "password": pass_word,
            }
            # Send the authentication request
            auth_response = self.session.post(cas_url, data=auth_data, allow_redirects=False)
            # print(f"{cas_url}:{auth_response.status_code}")
            # A successful authentication still returns 302 and redirects back to Jenkins
            if auth_response.status_code == 302:
                # Get the Jenkins URL (with the service ticket attached)
                url_with_ticket = auth_response.headers["location"]
                # Visit the final URL
                confirm_response = self.session.get(url=url_with_ticket, allow_redirects=True)
                # print(f"{url_with_ticket}:{confirm_response.status_code}")
                if confirm_response.status_code == 200:
                    info_log("login to Jenkins succeeded !!!")
                    # Write the post-login cookies to a file
                    self.write_cas_cookies_to_file(write_path, fileName)
                else:
                    error_log("login to Jenkins failed !!!")
            else:
                error_log(f"auth failed, url: {cas_url}")

    # Write the cookies to a file
    def write_cas_cookies_to_file(self, write_path, fileName):
        # Delete the cookie file if it already exists
        if os.path.exists(f"{write_path}{fileName}"):
            os.remove(f"{write_path}{fileName}")
        # Write the cookie file afresh
        with open(f"{write_path}{fileName}", 'wb') as f:
            # print(self.session.cookies)
            pickle.dump(self.session.cookies, f)


def info_log(msg):
    print(f"\033[0;32;40m{now_to_date()}:[INFO]:{msg}\033[0m")


def error_log(msg):
    print(f"\033[0;31;40m{now_to_date()}:[ERROR]:{msg}\033[0m")


def now_to_date(format_string="%Y-%m-%d %H:%M:%S"):
    time_stamp = int(time.time())
    time_array = time.localtime(time_stamp)
    str_date = time.strftime(format_string, time_array)
    return str_date


# if __name__ == '__main__':
#     casService = CasService()
#     url = "https://jenkins.xxx.com/securityRealm/commenceLogin?from=%2F"
#     userName = "xxx"
#     password = "xxx"
#     write_path = "/home/xxx/pytest/JKProjects/"
#     cookie_file_name = "cas_cookies.dat"
#     casService.login(url, userName, password, write_path, cookie_file_name)
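
To reuse the saved login later without going through CAS again, the pickled cookie jar can be loaded back into a fresh session (the Crawler class in section 2 does the same thing). A minimal standalone sketch, where cookie_path is a hypothetical path to the file written by write_cas_cookies_to_file:

# Minimal sketch: restore a saved CAS session from the pickled cookie file.
import os
import pickle
import requests


def load_session(cookie_path):
    session = requests.session()
    if os.path.exists(cookie_path):
        with open(cookie_path, 'rb') as f:
            # Merge the saved cookies into the new session's cookie jar
            session.cookies.update(pickle.load(f))
    return session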

2. Crawl the required information from Jenkins and write it to files

# -*- coding: utf-8 -*-
# @Time    : 2023/5/11
# @Author  : pengyong
# @desc    : Crawl the Jenkins directory structure
import pickle
import time
import requests
import os
import shutil
from bs4 import BeautifulSoup
from cas_login import CasService


class Crawler(object):

    def __init__(self, url, path, fileName):
        self.cookie = None
        self.response = None
        self.first_url = url
        self.write_path = path
        self.class_name_table = "jenkins-table  sortable"
        self.class_name_pane = "sortable pane bigtable"
        self.class_name_td = "jenkins-table__link model-link inside"
        self.project_index = "project.index"
        self.owner_flag = "责任人"
        self.session = requests.session()
        self.cookie_file_name = fileName
        self.description_file = "descriptions/"
        self.build_result_file = "lastBuildResult/"
        self.build_message = "allLastBuildMessage"
        self.headers = {
            "Accept": "text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8",
            "Accept-Language": "zh_CN",
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363",
        }

    # Load the cookies saved by the CAS login script
    def load_cas_cookies_from_file(self):
        if os.path.exists(f"{self.write_path}{self.cookie_file_name}"):
            with open(f"{self.write_path}{self.cookie_file_name}", 'rb') as f:
                self.session.cookies.update(pickle.load(f))

    # Start crawling Jenkins
    def get_views(self):
        # Request the home page and get the response
        response = self.session.post(self.first_url, headers=self.headers)
        if response.status_code != 200:
            error_log("error: login failed!!!")
            return
        html_doc = response.text
        soup = BeautifulSoup(html_doc, "html.parser")
        views = soup.find_all("div", class_="tab")
        # print(views)
        # Clean up history files before writing
        delete_when_is_exist(self.write_path, self.project_index)
        # If the directory holding the last build results exists, clean it up
        if os.path.exists(self.write_path + self.build_result_file):
            os.chdir(self.write_path)
            shutil.rmtree(self.build_result_file)
        # Walk through the view tabs
        for view in views:
            link = view.find("a")
            info_log("Version: " + link["href"])
            # print(link["href"], link.get_text())
            if link["href"] == "/view/all/":
                # Skip the "all" view; it only duplicates the per-category data
                continue
            # print(f"project:{link.get_text()}")
            # Append the project to the project index file
            writeFile(self.write_path, self.project_index, link.get_text())
            # Write the first-level entry into a per-project file (one file per project)
            delete_when_is_exist(self.write_path, link.get_text())
            writeFile(self.write_path, link.get_text(), f"project:{link.get_text()}")
            # Build the view URL
            url = self.first_url + link["href"]
            # Send the request
            response = self.session.get(url)
            html_doc = response.text
            soup = BeautifulSoup(html_doc, "html.parser")
            table_list = soup.find("table", id="projectstatus", class_=self.class_name_pane)
            # print(table_list)
            # Walk through the first-level list under each project tab
            if table_list is None:
                # The first-level entries are not directories: fetch and save the job info directly
                tds = soup.findAll("a", class_=self.class_name_td)
                for td in tds:
                    if td is not None:
                        print("Job: " + td.get_text())
                        tds1 = soup.find("tr", id=f"job_{td.get_text()}").findAll("td")
                        job_url = td["href"]
                        # Write the job info to the project file: parent directory, job name, job URL
                        writeFile(self.write_path, link.get_text(),
                                  f"job:{link.get_text()} {td.get_text()} {url + job_url}")
                        # description = tds1[3].contents[0]
                        description_td = tds1[3]
                        if len(description_td.contents) != 0:
                            description = description_td.contents[0]
                        else:
                            description = ""
                        # Job descriptions follow no fixed format, so each goes into its own file
                        writeDescriptionToFile(self, td, description)
                        # Write the build result to a file
                        # getTheLastBuildResults(self, url + job_url, td)
            else:
                # The first-level entries are directories: keep traversing
                cells = table_list.findAll("a")
                for cell in cells:
                    link_url = cell["href"]
                    link_text = cell.get_text()
                    if link_url == "#":
                        # Skip columns we don't need
                        continue
                    print("Directory: " + link_text)
                    # Build the first-level directory URL
                    url = self.first_url + link_url
                    # Open the first-level directory to reach the second level
                    response = self.session.get(url)
                    html_doc = response.text
                    soup = BeautifulSoup(html_doc, "html.parser")

                    # Write the description of the first-level directory to its own file
                    description_div = soup.find("div", id="description")
                    view_description = description_div.find("div") if description_div is not None else None
                    # print(view_description.text)
                    delete_when_is_exist(self.write_path + self.description_file, link_text)
                    writeFile(self.write_path + self.description_file, link_text,
                              view_description.text if view_description is not None else "")

                    tds = soup.findAll("a", class_=self.class_name_td)
                    if not tds:
                        continue
                    # Walk through the jobs in the second-level directory
                    count = 0
                    for td in tds:
                        count = count + 1
                        if td is not None:
                            print("Job: " + td.get_text())
                            tds1 = soup.find("tr", id=f"job_{td.get_text()}").findAll("td")
                            job_url = td["href"]
                            # Write the job info to the project file: parent directory, job name, job URL
                            writeFile(self.write_path, link.get_text(),
                                      f"job:{link_text} {td.get_text()} {url + job_url}")
                            # description = tds1[3].contents[0]
                            description_td = tds1[3]
                            if len(description_td.contents) != 0:
                                description = description_td.contents[0]
                            else:
                                description = ""
                            # Job descriptions follow no fixed format, so each goes into its own file
                            writeDescriptionToFile(self, td, description)
                            # Write the build result to a file
                            # getTheLastBuildResults(self, url + job_url, td)

                    # Write the count of second-level entries to the project file
                    writeFile(self.write_path, link.get_text(), f"path:{link_text} {count}")


# Write the job description to its own file; descriptions follow no fixed format,
# so each job gets a separate file
def writeDescriptionToFile(self, td, description):
    # Create the description directory if it does not exist yet
    if not os.path.exists(self.write_path + self.description_file):
        # os.makedirs() creates intermediate directories as needed
        os.makedirs(self.write_path + self.description_file)
    delete_when_is_exist(self.write_path + self.description_file, td.get_text())
    writeFile(self.write_path + self.description_file, td.get_text(), description)


# Write the most recent build result to a file
def getTheLastBuildResults(self, url, td):
    # Open the job detail page and read the result of the most recent build
    job_detail_response = self.session.get(url)
    job_html_doc = job_detail_response.text
    # print(job_html_doc)
    job_soup = BeautifulSoup(job_html_doc, "html.parser")
    build_history = job_soup.find("a", class_="build-status-link")
    print(build_history)
    if build_history is not None:
        # The build status is encoded in the SVG <use> references on the status link
        last_build_status = build_history.findAll("use")[0]["href"].split("#")[1]
        if last_build_status is None:
            last_build_result = ""
        elif last_build_status == "build-status-in-progress":
            last_build_result = "last-progress"
        else:
            last_build_result = build_history.findAll("use")[1]["href"].split("#")[1]
        last_build_number = job_soup.find("a", class_="model-link inside build-link display-name").text
        last_build_time = job_soup.find("a", class_="model-link inside build-link").text
        last_build_message = td.get_text() + " " + last_build_number + " " + last_build_result + " " + last_build_time
        # print(last_build_message)
        if not os.path.exists(self.write_path + self.build_result_file):
            # os.makedirs() creates intermediate directories as needed
            os.makedirs(self.write_path + self.build_result_file)
        writeFile(self.write_path + self.build_result_file, self.build_message, last_build_message)


# Write Jenkins info to a file
def writeFile(file_path, file_name, content):
    # print(f"{file_path}{file_name}")
    with open(f"{file_path}{file_name}", 'a+', encoding='utf-8') as f:
        f.write(f"{content}\n")


# Delete the file first if it already exists
def delete_when_is_exist(file_path, file_name):
    if os.path.exists(f"{file_path}{file_name}"):
        os.remove(f"{file_path}{file_name}")


def info_log(msg):
    print(f"\033[0;32;40m{now_to_date()}:[INFO]:{msg}\033[0m")


def error_log(msg):
    print(f"\033[0;31;40m{now_to_date()}:[ERROR]:{msg}\033[0m")


def now_to_date(format_string="%Y-%m-%d %H:%M:%S"):
    time_stamp = int(time.time())
    time_array = time.localtime(time_stamp)
    str_date = time.strftime(format_string, time_array)
    return str_date


if __name__ == '__main__':
    info_log("***Start crawling,please wait......***")
    run_time_start = time.time()

    firstUrl = "https://jenkins.xxx.com"
    loginUrl = "https://jenkins.xxx.com/securityRealm/commenceLogin?from=%2F"
    userName = "xxx"
    password = "xxx"
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    write_path = parent_dir + "/JKProjects/"
    cookie_file_name = "cas_cookies.dat"

    # Initialize the login service
    casService = CasService()
    # Perform the login
    casService.login(loginUrl, userName, password, write_path, cookie_file_name)
    # Initialize the Jenkins crawler
    craw = Crawler(firstUrl, write_path, cookie_file_name)
    # Load the cookies
    craw.load_cas_cookies_from_file()
    # Start crawling
    craw.get_views()

    run_time_end = time.time()
    run_time = (run_time_end - run_time_start)
    info_log("***End of crawling***")
    info_log(f"running time is:{run_time} s")

Sample of the generated files:
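
Based on the writeFile calls above, the output looks roughly like the sketch below; ProjectA, demo-job, sub-dir, and the URL are hypothetical placeholders:

JKProjects/project.index            # one view (project) name per line
    ProjectA
    ProjectB

JKProjects/ProjectA                 # one file per view
    project:ProjectA
    job:ProjectA demo-job https://jenkins.xxx.com/view/ProjectA/job/demo-job/
    path:sub-dir 12                 # sub-dir contains 12 jobs

JKProjects/descriptions/demo-job    # one free-form description file per job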

3. Java triggers the crawl on a schedule; it then reads the required information from the files (following the format written above) via shell commands and stores it in the database

package com.sics.testplatform.common;

import com.sics.testplatform.controller.build.analysis.IBuildAnalysis;
import com.sics.testplatform.controller.build.broadcast.IBuildBroadcast;
import com.sics.testplatform.mapper.build.analysis.ScheduledMapper;
import com.sics.testplatform.service.run.RunRecordsServiceImpl;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.SchedulingConfigurer;
import org.springframework.scheduling.config.ScheduledTaskRegistrar;
import org.springframework.scheduling.support.CronTrigger;
import org.springframework.stereotype.Component;

/**
 * Jenkins sync - scheduled task
 *
 * @author pengyong
 * @create 2023/05/24
 */
@Component
@EnableScheduling
@Slf4j
public class CronTaskConfig implements SchedulingConfigurer {

    @Autowired
    ScheduledMapper scheduledMapper;
    @Autowired
    IBuildAnalysis iBuildAnalysis;
    @Autowired
    IBuildBroadcast iBuildBroadcast;
    @Autowired
    RunRecordsServiceImpl runRecordsService;

    @Override
    public void configureTasks(ScheduledTaskRegistrar scheduledTaskRegistrar) {

        scheduledTaskRegistrar.addTriggerTask(this::process,
                triggerContext -> {
                    String cron = scheduledMapper.selectCronById("2");
                    if (cron == null || cron.isEmpty()) {
                        log.error("Database: no scheduled-task configuration found in the SCHEDULED table");
                        return null;
                    } else {
                        return new CronTrigger(cron).nextExecutionTime(triggerContext);
                    }
                });
    }

    private void process() {
        log.info("******* process1 : cron task is running *******");
        iBuildAnalysis.syncJenkinsAllDirectory();
    }

}
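
Because the trigger lambda re-reads the cron expression from the SCHEDULED table every time the next execution is computed, the schedule can be changed at runtime without a restart. For example, storing the six-field Spring cron expression 0 0 2 * * ? in the row with id "2" would run process() every day at 02:00.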

    /**
     * Sync the Jenkins directory structure and store it in the database.
     * The directory data is crawled into a known location by the crawler script; the sync runs these steps:
     * 1. Invoke the Python crawler script to crawl the Jenkins directory structure
     * 2. Run shell commands to read the result files and return their content to the Java program
     * 3. Parse the result in Java and write it to the database
     *
     * @return sync result
     */
    @Override
    public HttpFormatResult syncJenkinsAll() {
        if (isJenkinsSync) {
            return new HttpFormatResult(HttpReturnMsg.FAIL_CODE, "A sync task is already running, or someone else is running it; please try again later");
        }
        isJenkinsSync = true;
        // Run the sync on a single-thread pool
        ExecutorService service = Executors.newSingleThreadExecutor();
        service.execute(() -> {
            log.info("****** Start syncing Jenkins info *****");
            try {
                // Clean up historical data before running
                YTPUtils.run("cd " + BuildConstant.BASE_PATH + "; rm -rf " + BuildConstant.SYNC_RESULT_PATH);
                YTPUtils.run("cd " + BuildConstant.BASE_PATH + "; mkdir -p " + BuildConstant.SYNC_RESULT_PATH + BuildConstant.DESCRIPTION_RESULT_PATH);
                // Run the crawler script
                YTPUtils.execute("cd " + BuildConstant.BASE_PATH + BuildConstant.SCRIPT_FILE + "; "
                        + BuildConstant.PYTHON3_CMD + BuildConstant.SCRIPT_NAME);
                // Read the crawl result and return it
                String getJenkinsMsg = BuildConstant.CAT_CMD + BuildConstant.BASE_PATH + BuildConstant.SYNC_RESULT_PATH + BuildConstant.PROJECT_INDEX;
                List<String> projectList = YTPUtils.executeCMD(getJenkinsMsg);
//            log.info(String.valueOf(projectList));
                // First-level directories
                firstDirStorage(projectList);
                // Second-level directories; second-level jobs are handled in the job-info method
                secondDirOrJobStorage(projectList);
                // Job info list
                jobMessageStorage(projectList);
                isJenkinsSync = false;
            } catch (Exception e) {
                isJenkinsSync = false;
                throw new RuntimeException(e.getMessage());
            }
        });
        // Let the worker thread exit once the sync finishes
        service.shutdown();
        // The original excerpt omitted this return; assuming HttpReturnMsg defines a
        // SUCCESS_CODE constant alongside the FAIL_CODE used above
        return new HttpFormatResult(HttpReturnMsg.SUCCESS_CODE, "Sync task started");
    }
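
The three *Storage methods are not shown in this excerpt; judging by the file format the crawler writes, firstDirStorage presumably consumes the project: lines, secondDirOrJobStorage the path: lines, and jobMessageStorage the job: lines (splitting each job line on spaces into parent directory, job name, and job URL) before inserting the records into the database.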

The shell execution helper:

public static String run(String command) {
        Scanner input = null;
        StringBuilder result = new StringBuilder();
        Process process = null;
        // Run the command through a shell so cd, pipes, and globbing work
        List<String> commandArr = new ArrayList<>();
        commandArr.add("/bin/sh");
        commandArr.add("-c");
        commandArr.add(command);
        log.info(">>> cmd: " + command);
        try {
            process = Runtime.getRuntime().exec(commandArr.toArray(new String[commandArr.size()]));
            // Wait up to 10 seconds for the command to finish. Note that stdout is only
            // read afterwards, so a command whose output fills the pipe buffer can stall
            // here; commands with large output should be drained while waiting.
            process.waitFor(10, TimeUnit.SECONDS);
            InputStream is = process.getInputStream();
            input = new Scanner(is);
            // Collect stdout line by line
            while (input.hasNextLine()) {
                String line = input.nextLine() + "\n";
                log.info(">>> line:" + line);
                result.append(line);
            }
        } catch (Exception e) {
            log.error("error:" + e.getMessage());
            e.printStackTrace();
        } finally {
            if (input != null) {
                input.close();
            }
            if (process != null) {
                process.destroy();
            }
        }
        return result.toString().trim();
    }
