# @Time : 2022/9/28 21:56
# @Author : pzh
# @File : ali_cvd_detail.py
# -*- coding: utf-8 -*-
import re
import requests
from random import randint
import time
from lxml import etree
from cache import memory_cache
import datetime
import threading
from LoggingUtils import logger
def get_onepage_content(url):
    """Download *url* with HTTP GET and return the response body as text.

    A desktop browser User-Agent is picked at random from a small pool for
    every request, and the request times out after 60 seconds.

    Args:
        url: Absolute URL to request.

    Returns:
        The response text when the server answers with status 200;
        ``None`` when the status is anything else or the request fails.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.7113.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4482.0 Safari/537.36 Edg/92.0.874.0',
    )
    # Index derived from the pool size so adding/removing an agent cannot
    # leave entries unreachable or cause an IndexError.
    headers = {'User-Agent': user_agents[randint(0, len(user_agents) - 1)]}
    try:
        response = requests.get(url, headers=headers, timeout=60)
    except requests.RequestException as exc:
        # Still best-effort (callers get None), but the failure is now
        # visible in the log instead of being swallowed silently.
        logger.warning(f"request failed for {url}: {exc}")
        return None
    if response.status_code != 200:
        logger.warning(f"unexpected HTTP status {response.status_code} for {url}")
        return None
    return response.text
# One row of the listing table -> (id, name, type, date, level).
# Compiled once at import time instead of on every call.
_CVE_ROW_PATTERN = re.compile(
    '<tr>.*?target="_blank">(.*?)</a></td>.*?<td>(.*?)</td>.*?<button.*?>(.*?)</button>.*?nowrap="nowrap">(.*?)</td>'
    '.*?<button.*?>(.*?)</button>.*?</tr>',
    re.S,
)


def show_cve_content(res):
    """Yield one dict per table row found in the listing-page HTML *res*.

    Args:
        res: HTML text of a listing page. ``None`` or an empty string
            (e.g. after a failed download) produces no items instead of
            raising.

    Yields:
        Dicts with the keys ``cve_id``, ``vul_name``, ``cul_type``,
        ``cve_date`` and ``cvs_level``. All values except ``vul_name`` have
        surrounding whitespace stripped; ``vul_name`` is passed through as
        matched. The key spellings are kept unchanged because they end up in
        the saved output.
    """
    if not res:
        return
    for cve_id, vul_name, vul_type, cve_date, level in _CVE_ROW_PATTERN.findall(res):
        yield {
            'cve_id': cve_id.strip(),
            'vul_name': vul_name,
            'cul_type': vul_type.strip(),
            'cve_date': cve_date.strip(),
            'cvs_level': level.strip(),
        }
# Prefix of the output file; the code that uses it appends a date string
# and the '.txt' extension.
path = "E://Asiainfo//webappss//pythonDemo//Test//ali_cnvd_"
def save_content_to_text(content, now_time):
    """Append *content* as one line to the output file for *now_time*.

    Records from every listing page go into the same file; the file name is
    the module-level ``path`` prefix + *now_time* + ``'.txt'``.

    Args:
        content: Text of one record (written followed by a newline).
        now_time: String appended to the ``path`` prefix to form the file name.
    """
    # Explicit UTF-8: without it the platform default codec is used, which
    # can fail (or produce unreadable files) for non-ASCII record text.
    with open(path + now_time + '.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')
# Text node of the pager element that carries "<current> / <total>页".
_PAGER_XPATH = "//div[@class='d-flex justify-content-between align-items-center']/span[@class='text-muted']/text()"

# XPath of the i-th row of the CVSS breakdown list on a detail page.
_CVSS_ITEM_XPATH = "//ul[@class='cvss-breakdown__items']/li[{}]/div[@class='cvss-breakdown__desc']/text()"

# (output key, XPath) for every field taken from a detail page, in the order
# the keys are added to a record.
#   loudong  - vulnerability description
#   jianyi   - remediation advice
#   cankao   - reference links
#   gongji1..gongji8 - CVSS breakdown rows: attack vector, attack complexity,
#       privileges required, scope, user interaction, availability,
#       confidentiality, integrity
_DETAIL_FIELDS = (
    ('loudong', "//div[@class='text-detail pt-2 pb-4']/div/text()"),
    ('jianyi', "//div[@class='text-detail pt-2 pb-4'][2]/text()"),
    ('cankao', "//table[@class='table table-sm table-responsive']/tbody/tr/td/a/@href"),
) + tuple(('gongji' + str(i), _CVSS_ITEM_XPATH.format(i)) for i in range(1, 9))


def _parse_total_pages(texts):
    """Return the total page count found in the pager text nodes, or None."""
    found = re.search(r'/\s*(\d+)\s*页', ''.join(texts))
    return int(found.group(1)) if found else None


def _flatten(nodes):
    """Render an XPath result list as compact text without newlines or spaces."""
    # Newlines must be removed from the items themselves: once the list is
    # stringified they are escaped and a plain replace no longer sees them.
    return str([node.replace("\n", "") for node in nodes]).replace(" ", "")


def _add_detail_fields(content):
    """Fetch the detail page of content['cve_id'] and store its fields in *content*."""
    cveId = content['cve_id']
    detail_html = get_onepage_content(f'https://avd.aliyun.com/detail?id={cveId}')
    # A detail page that could not be downloaded gives empty fields ("[]")
    # instead of aborting the whole run.
    tree = etree.HTML(detail_html) if detail_html else None
    for key, xpath in _DETAIL_FIELDS:
        nodes = tree.xpath(xpath) if tree is not None else []
        content[key] = _flatten(nodes)


def main():
    """Run one scraping pass and schedule the next one.

    Steps:
      1. Start a timer that calls ``main`` again after 120 seconds.
      2. Read the page total stored by the previous pass from the cache
         (key ``"TotalCNVD"``) and the current total from the listing page.
      3. If the totals differ, store the new total (86400 passed as the third
         ``set_value`` argument) and walk the listing pages: every page when
         no previous total is cached, otherwise the first
         ``current - previous`` pages.
      4. For each listing row, add the detail-page fields and append the
         record to today's output file.

    Pages that cannot be downloaded are logged and skipped.

    NOTE(review): the original comment said "run main every day at 03:00"
    while the code re-arms every 120 seconds; the 120-second behaviour is
    kept — confirm which one is intended.
    """
    interval = 120
    logger.info("当前任务为:"+str(interval)+"秒执行一次")
    # Scheduled before any work is done, so the follow-up run happens even
    # if this pass raises further down.
    timer = threading.Timer(interval, main)
    timer.start()

    startTime = time.time()
    value = memory_cache.get_value("TotalCNVD")
    logger.info("当前任务获取上次数量为:"+str(value))

    # Read the current page total from the first listing page.
    listing_html = get_onepage_content('https://avd.aliyun.com/search?q=cnvd')
    if not listing_html:
        logger.info("listing page could not be fetched; skipping this run")
        return
    listing_tree = etree.HTML(listing_html)
    pager_texts = listing_tree.xpath(_PAGER_XPATH) if listing_tree is not None else []
    total_pages = _parse_total_pages(pager_texts)
    if total_pages is None:
        logger.info("total page count not found on the listing page; skipping this run")
        return
    total = str(total_pages)
    logger.info("当前最新任务获取页数为:"+total)

    # Same total as last time means nothing new was published.
    if str(value).strip() == total:
        logger.info("无更新")
        return

    # Remember the new total for the next pass.
    memory_cache.set_value("TotalCNVD", total, 86400)
    # No cached total means this is the first pass: take everything.
    if value is None:
        pageNum = total_pages
    else:
        pageNum = total_pages - int(value)
    logger.info("需要循环的页数为:"+str(pageNum))

    now_time = datetime.datetime.now().strftime('%Y-%m-%d')
    logger.info("当前任务文件路径及名称:"+path+now_time+'.txt')

    # The original looped over the placeholder range(1, 2) with a note to
    # substitute pageNum; pages are numbered from 1, hence the + 1.
    for pagenum in range(1, pageNum + 1):
        url = f'https://avd.aliyun.com/search?q=cnvd&page={pagenum}'
        html = get_onepage_content(url)
        if not html:
            logger.info("listing page "+str(pagenum)+" could not be fetched; skipped")
            continue
        # Each page is parsed on its own so rows of earlier pages are not
        # processed again.
        for content in show_cve_content(html):
            _add_detail_fields(content)
            save_content_to_text(str(content), now_time)
        endTime = time.time()
        # Single pre-built message: logger.info is not print() and does not
        # join extra positional arguments.
        logger.info("cve循环"+str(pagenum)+"完成,耗时:"+str(endTime - startTime))
# Current time.
now_time = datetime.datetime.now()
# Calendar date of tomorrow, split into its parts.
tomorrow = (now_time + datetime.timedelta(days=1)).date()
next_year, next_month, next_day = tomorrow.year, tomorrow.month, tomorrow.day
# 03:00:00 on tomorrow's date.
next_time = datetime.datetime(next_year, next_month, next_day, 3, 0, 0)
# Seconds from now until that moment.
timer_start_time = (next_time - now_time).total_seconds()
logger.info("获取距离明天3点时间(秒):"+str(timer_start_time))
# Timer arguments: (delay in seconds, function to call).
timer = threading.Timer(timer_start_time, main)
timer.start()

if __name__ == "__main__":
    main()
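# Source article title: "Crawling the Aliyun vulnerability database (CNVD) with Python"
# Web page footer: latest recommended article published 2024-04-30 11:11:45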