Python 爬取阿里云漏洞库（AVD）中的 CNVD 漏洞数据

最新推荐文章于 2023-06-30 09:21:18 发布
非衣化十
最新推荐文章于 2023-06-30 09:21:18 发布
阅读量854
点赞数
分类专栏： python 文章标签： python 开发语言
本文链接：https://blog.csdn.net/weixin_44331151/article/details/127245427
版权
python 专栏收录该内容
3 篇文章 0 订阅
订阅专栏
# @Time : 2022/9/28 21:56
# @Author : pzh
# @File : ali_cvd_detail.py
# -*- coding: utf-8 -*-
import re
import requests
from random import randint
import time
from lxml import etree
from cache import memory_cache
import datetime
import threading
from LoggingUtils import logger

def get_onepage_content(url):
    """Fetch *url* with a randomly chosen desktop User-Agent and return its body.

    Args:
        url: Absolute URL to request with HTTP GET.

    Returns:
        The response body as ``str`` when the server answers with status 200;
        ``None`` for any other status code or when the request fails.
    """
    user_agent = ['Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.7113.93 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4482.0 Safari/537.36 Edg/92.0.874.0']
    # Derive the upper bound from the list so editing the list cannot cause an IndexError.
    headers = {'User-Agent': user_agent[randint(0, len(user_agent) - 1)]}
    try:
        response = requests.get(url, headers=headers, timeout=60)
        if response.status_code == 200:
            return response.text
    except Exception as exc:
        # Callers treat None as "fetch failed", so the fetch stays best-effort,
        # but the reason is logged instead of being discarded.
        logger.info("请求异常：" + str(url) + " " + repr(exc))
        return None
    logger.info("请求返回非200状态码：" + str(url) + " " + str(response.status_code))
    return None


def show_cve_content(res):
    """Yield one dict per table row matched in a search-result HTML page.

    Args:
        res: HTML text of a listing page.

    Yields:
        A dict with the keys ``cve_id``, ``vul_name``, ``cul_type``,
        ``cve_date`` and ``cvs_level``. Every value except ``vul_name`` has
        surrounding whitespace stripped; ``vul_name`` is passed through as
        captured.
    """
    row_pattern = re.compile(
        '<tr>.*?target="_blank">(.*?)</a></td>.*?<td>(.*?)</td>.*?<button.*?>(.*?)</button>.*?nowrap="nowrap">(.*?)</td>'
        '.*?<button.*?>(.*?)</button>.*?</tr>',
        re.S,
    )
    for row in row_pattern.finditer(res):
        # The five capture groups, in the order they appear in the pattern.
        identifier, title, category, published, severity = row.groups()
        record = {}
        record['cve_id'] = identifier.strip()
        record['vul_name'] = title
        record['cul_type'] = category.strip()
        record['cve_date'] = published.strip()
        record['cvs_level'] = severity.strip()
        yield record


# Output file prefix; the run date and ".txt" are appended to it.
path ="E://Asiainfo//webappss//pythonDemo//Test//ali_cnvd_"


def save_content_to_text(content,now_time):
    """Append one record as a line to the output file for *now_time*.

    All pages of one run go into the same file.

    Args:
        content: Text of the record; a trailing newline is added.
        now_time: String inserted between the ``path`` prefix and ``.txt``
            to form the file name.
    """
    # Pin the encoding: without it open() uses the locale code page, which
    # can fail on scraped characters it cannot represent.
    with open(path + now_time + '.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')


# (output key, XPath) for every detail-page field, in the order the keys are
# added to each record.
_DETAIL_XPATHS = (
    # vulnerability description
    ('loudong', "//div[@class='text-detail pt-2 pb-4']/div/text()"),
    # remediation advice
    ('jianyi', "//div[@class='text-detail pt-2 pb-4'][2]/text()"),
    # reference links
    ('cankao', "//table[@class='table table-sm table-responsive']/tbody/tr/td/a/@href"),
) + tuple(
    # gongji1..gongji8, in order: attack vector, attack complexity,
    # privileges required, scope, user interaction, availability,
    # confidentiality, integrity
    (f'gongji{i}',
     f"//ul[@class='cvss-breakdown__items']/li[{i}]/div[@class='cvss-breakdown__desc']/text()")
    for i in range(1, 9)
)


def _parse_total_pages(text_nodes):
    """Return the token between "/" and "页" in the pager text, or None if it is not a number."""
    parts = str(text_nodes).split("/")
    if len(parts) < 2:
        return None
    token = parts[1].split('页')[0]
    # The token is returned unstripped so it compares equal to values cached by earlier runs.
    return token if token.strip().isdigit() else None


def _flatten_xpath_result(values):
    """Render an XPath string-list result as one line of text without spaces or newlines."""
    # Clean each item before taking the list repr: in the repr a newline is
    # already the two characters "\" + "n", so replacing "\n" there is a no-op.
    cleaned = [value.replace("\n", "") for value in values]
    return str(cleaned).replace(" ", "")


def main():
    """Check the AVD CNVD listing for a changed page count and save new records.

    Each call first re-arms a 120-second timer that calls ``main`` again, then:

    1. reads the page count stored under the ``TotalCNVD`` cache key;
    2. fetches the first listing page and parses the current page count;
    3. if the two differ, stores the new count, fetches the result pages,
       fetches the detail page of every row and appends each record to the
       file for today's date through ``save_content_to_text``.

    A page that cannot be fetched is logged and skipped instead of raising
    inside the timer thread.
    """
    # NOTE(review): the original comment here said "run main every day at
    # 3 a.m.", but the code re-schedules itself every 120 seconds.
    logger.info("当前任务为："+'120'+"秒执行一次")
    threading.Timer(120, main).start()

    startTime = time.time()
    # Page count recorded by the previous run (None when nothing is cached).
    value = memory_cache.get_value("TotalCNVD")
    logger.info("当前任务获取上次数量为："+str(value))

    # Read the current page count from the pager on the first listing page.
    listing = get_onepage_content('https://avd.aliyun.com/search?q=cnvd')
    if not listing:
        logger.info("获取列表首页失败，本次任务结束")
        return
    totalList = etree.HTML(listing).xpath(
        "//div[@class='d-flex justify-content-between align-items-center']/span[@class='text-muted']/text()")
    total_pages = _parse_total_pages(totalList)
    if total_pages is None:
        logger.info("未能解析总页数，本次任务结束："+str(totalList))
        return
    logger.info("当前最新任务获取页数为："+total_pages)

    # An unchanged page count is treated as "nothing new".
    if str(value) == total_pages:
        logger.info("无更新")
        return

    # Remember the new count for the next run (expiry argument: 86400).
    # NOTE(review): this is stored before crawling, so a crawl that fails
    # below is not retried until the count changes again.
    memory_cache.set_value("TotalCNVD", total_pages, 86400)
    # No cached value means first run: all pages; otherwise only the difference.
    if str(value) == 'None':
        pageNum = int(total_pages)
    else:
        pageNum = int(total_pages) - int(value)
    logger.info("需要循环的页数为："+str(pageNum))
    date_stamp = datetime.datetime.now().strftime('%Y-%m-%d')
    logger.info("当前任务文件路径及名称："+path+date_stamp+'.txt')

    # NOTE(review): the original author marked the upper bound 2 as a test
    # value to be replaced by pageNum for production; only page 1 is fetched.
    htmls = ""
    for pagenum in range(1, 2):
        url = f'https://avd.aliyun.com/search?q=cnvd&page={pagenum}'
        html = get_onepage_content(url)
        if html is None:
            logger.info("获取列表页失败，跳过："+url)
            continue
        htmls = html + htmls

    for content in show_cve_content(htmls):
        cveId = content['cve_id']
        htmldetail = get_onepage_content(f'https://avd.aliyun.com/detail?id={cveId}')
        if not htmldetail:
            logger.info("获取详情页失败，跳过："+cveId)
            continue
        detail_tree = etree.HTML(htmldetail)
        for key, expression in _DETAIL_XPATHS:
            content[key] = _flatten_xpath_result(detail_tree.xpath(expression))
        save_content_to_text(str(content), date_stamp)

    endTime = time.time()
    # Build one message string: extra positional arguments would be treated
    # as %-format arguments by a stdlib-style logger.
    logger.info("cve循环"+str(pagenum)+"完成,耗时："+str(endTime - startTime))

# Current moment, and the calendar date of the following day.
now_time = datetime.datetime.now()
_tomorrow = (now_time + datetime.timedelta(days=1)).date()
next_year, next_month, next_day = _tomorrow.year, _tomorrow.month, _tomorrow.day
# 03:00:00 on the following day.
next_time = datetime.datetime(next_year, next_month, next_day, 3, 0, 0)

# Seconds remaining from now until 03:00 on the following day.
timer_start_time = (next_time - now_time).total_seconds()
logger.info("获取距离明天3点时间（秒）:"+str(timer_start_time))

# One-shot timer: Timer(delay in seconds, function to call).
# NOTE(review): this runs at import time as well as when executed as a script.
timer = threading.Timer(timer_start_time, main)
timer.start()

# When executed as a script, main also runs once immediately.
if __name__ == "__main__":
    main()