This blog post is for technical discussion only; if it infringes on any rights, please contact the author to have it removed.
This time I want to crawl CNNVD (the China National Vulnerability Database of Information Security) for vulnerability information, saving each vulnerability record as its own JSON file.
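The snippets below rely on a few module-level names (headers, url, all_page) and an init() helper whose definitions the post does not show. A minimal sketch of what they might look like follows; the User-Agent string, the base url value, and the body of init() are my assumptions:

import os
import time
import json

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # assumed: any common browser User-Agent
url = 'http://www.cnnvd.org.cn'          # assumed: base URL prepended to relative detail links
all_page = 1                             # filled in by get_all_page()

def init():
    # Assumed: make sure the output directory for the JSON files exists.
    os.makedirs('./cve_json', exist_ok=True)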
1. Get the total page count
First fetch the total page count, so that a for loop can crawl every page of vulnerability data:
# Get the total number of result pages
def get_all_page():
    global all_page
    req = requests.get('http://www.cnnvd.org.cn/web/vulnerability/querylist.tag',
                       headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")
    message = soup.find('div', class_='page').find('a')
    if not hasattr(message, 'text'):
        # No pager anchor on the page: there is only a single page of results
        all_page = 1
    else:
        # The anchor text holds the total entry count after a full-width colon;
        # strip the thousands separator, then divide by 10 entries per page
        all_page = int(int(message.text.split(':')[1].replace(',', '')) / 10) + 1
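For example, if the pager anchor's text were 总条数:123,456 (an illustrative value, not taken from the site), stripping the comma gives 123456; at 10 entries per page, int(123456 / 10) + 1 = 12346 pages are needed.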
2. Get all vulnerability links on the current page
With the total page count known, collect the URL of each vulnerability on the current page and process them one by one. Exceptions are handled here: when an entry still fails after several retries (the code gives up once the retry counter passes 3), its index on the page is printed:
# Get every vulnerability link on the current page
def get_now_page_all_url(now_url):
    req = requests.get(now_url, headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")
    message = soup.find('div', class_='list_list').find('ul').find_all('li')
    j = 0
    for data in message:
        i = 0
        while True:
            try:
                get_vulnerability_detail(data.div.a['href'])
            except Exception:
                if i > 3:
                    # Give up on this entry and record its index on the page
                    print(str(j) + '***', end='\t')
                    break
                i = i + 1
                continue
            break
        j = j + 1
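As an aside, the inline while/try loop above can be factored into a small retry helper. The sketch below is my own refactoring, not part of the original post; retry_call is an invented name:

def retry_call(func, *args, attempts=4):
    # Call func(*args), retrying on any exception; re-raise after the final attempt.
    for n in range(attempts):
        try:
            return func(*args)
        except Exception:
            if n == attempts - 1:
                raise
            time.sleep(1)  # brief pause before retrying

With it, the loop body shrinks to a single retry_call(get_vulnerability_detail, data.div.a['href']) wrapped in one try/except that prints the failing index.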
3. Parse each vulnerability and save it as a JSON file
When parsing a vulnerability page, each field is checked individually: if the field is missing or its text is empty, the default value '暂无' ("not available") is used instead. The fields are collected into a dict and then serialized to JSON:
# Helper: return a tag's text with spaces and line breaks stripped,
# or `default` when the tag is missing or its text is empty
def clean_text(tag, default='暂无'):
    if tag is None or not hasattr(tag, 'text'):
        return default
    text = tag.text.replace(' ', '').replace('\n', '').replace('\r', '')
    return text if text else default

# Parse one vulnerability page and save it as a JSON file
def get_vulnerability_detail(url_now):
    req = requests.get(url + url_now, headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")
    info = {}
    detail = soup.find('div', class_='detail_xq w770')
    info["cve_name"] = detail.find('h2').text
    # The CNNVD number is rendered as 'label:value'; keep only the value
    cnnvd_no = clean_text(detail.find('ul').find('span'))
    info["cnnvd_no"] = cnnvd_no if cnnvd_no == '暂无' else cnnvd_no.split(':')[1]
    message = detail.find('ul').find_all('li')
    info["cnnvd_level"] = clean_text(message[1].a)
    # When there is no separate CVE number, reuse the CNNVD number
    info["cve_no"] = clean_text(message[2].a, default=info["cnnvd_no"])
    info["catag"] = clean_text(message[3].a)
    info["start_time"] = clean_text(message[4].a)
    info["threat_cata"] = clean_text(message[5].a)
    info["update_time"] = clean_text(message[6].a)
    info["company"] = clean_text(message[7].a)
    info["from"] = clean_text(message[8].a)
    # Description: concatenate every <p> in the first text block
    paragraphs = soup.find('div', class_='d_ldjj').find_all('p')
    introduction = ''.join(clean_text(p, default='') for p in paragraphs)
    info["introduction"] = introduction if introduction else '暂无'
    # The remaining blocks share one class; index them in page order
    blocks = soup.find_all('div', class_='d_ldjj m_t_20')
    # Vendor bulletin: '|'-separated paragraph texts
    bulletin = '|'.join(clean_text(p, default='') for p in blocks[0].find_all('p'))
    info["bulletin"] = bulletin if bulletin else '暂无'
    # External references
    reference = '|'.join(clean_text(p, default='') for p in blocks[1].find_all('p'))
    info["reference"] = reference if reference else '暂无'
    # Affected products
    victim = '|'.join(clean_text(li.div.a, default='') for li in blocks[2].find_all('li'))
    info["victim"] = victim if victim else '暂无'
    # Patch links
    patch = '|'.join(clean_text(li.div.a, default='') for li in blocks[3].find_all('li'))
    info["patch"] = patch if patch else '暂无'
    # One JSON file per vulnerability, named after its CVE number
    jsonData = json.dumps(info, ensure_ascii=False)
    with open('./cve_json/' + str(info["cve_no"]) + '.json', 'w', encoding='utf-8') as fileObject:
        fileObject.write(jsonData)
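Once a crawl finishes, it is easy to sanity-check the output by loading the files back; a quick sketch:

import glob
import json

for path in glob.glob('./cve_json/*.json'):
    with open(path, encoding='utf-8') as f:
        info = json.load(f)
    print(info['cve_no'], info['cnnvd_level'], info['cve_name'])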
4. Main function
The main function loops over every page; when a page fails to load because of a network error, it waits and retries until the page is fetched successfully.
# Main entry point
if __name__ == '__main__':
    init()
    get_all_page()
    # 4219 is presumably a resume point from an earlier interrupted run;
    # start from 1 to crawl every page
    for now_page in range(4219, all_page + 1):
        while True:
            try:
                print(now_page, end='\t')
                get_now_page_all_url('http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno='
                                     + str(now_page) + '&repairLd=')
                print('end', end='\t')
            except Exception:
                print('error', end='\t')
                time.sleep(3)
                continue
            break