Python Crawler: Querying the National Information Security Vulnerability Database (CNNVD)

This post is for technical discussion only. If anything here infringes on your rights, please contact the author to have it removed.

This time I want to crawl the China National Vulnerability Database of Information Security (CNNVD) and collect its vulnerability records, saving each vulnerability as a separate JSON file.
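
The functions below rely on a few module-level names that the post does not show: the requests/BeautifulSoup/json/time imports, a headers dict, the site root url, and an init() helper. The following is a minimal setup sketch under those assumptions; the header value and output directory name are my guesses, not the author's original code:

# Minimal setup sketch (assumed, not the original post's init): imports and globals
# that the functions below expect to exist.
import json
import os
import time

import requests
from bs4 import BeautifulSoup

# Site root; detail pages are fetched as url + relative_href.
url = 'http://www.cnnvd.org.cn'

# Any reasonable browser-like User-Agent should do here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0 Safari/537.36',
}

all_page = 1  # set by get_all_page()

def init():
    # Make sure the output directory for the per-vulnerability JSON files exists.
    os.makedirs('./cve_json', exist_ok=True)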

1. Get the total number of pages

First fetch the total page count, so that a for loop can crawl every page of vulnerability data:

# Get the total number of pages
def get_all_page():
    global all_page
    req = requests.get('http://www.cnnvd.org.cn/web/vulnerability/querylist.tag', headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")
    message = soup.find('div', class_='page').find('a')
    if not hasattr(message, 'text'):
        all_page = 1
    else:
        # The pager anchor text looks like "总条数:NN,NNN"; the listing shows 10 records per page.
        all_page = int(int(message.text.split(':')[1].replace(',', '')) / 10) + 1
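
For example, if the pager link text were '总条数:42,188' (an invented value, purely for illustration), the computation would give:

# Illustration only: how all_page is derived from hypothetical pager text.
text = '总条数:42,188'                               # hypothetical anchor text
total = int(text.split(':')[1].replace(',', ''))     # 42188 records in total
pages = int(total / 10) + 1                          # 10 records per listing page -> 4219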

2. Get all vulnerability links on the current page

With the total page count known, the next step is to collect the URL of every vulnerability on a listing page and process each one. I add exception handling here: if a vulnerability still fails after three retries, its index on the page is printed:

# Get all vulnerability links on the current page
def get_now_page_all_url(now_url):
    req = requests.get(now_url, headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")
    message = soup.find('div', class_='list_list').find('ul').find_all('li')
    j = 0
    for data in message:
        i = 0
        while True:
            try:
                get_vulnerability_detail(data.div.a['href'])
            except Exception:
                # Retry a failed detail page; after three extra attempts, report its index and move on.
                if i > 3:
                    print(str(j) + '***', end='\t')
                    break
                i = i + 1
                continue
            break
        j = j + 1
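
To smoke-test the listing parser on a single page before launching the full crawl, it can be called directly (page 1 here is just an example value):

# Quick manual check of one listing page (example page number).
get_now_page_all_url('http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1&repairLd=')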

3. Extract the vulnerability details and save them to a JSON file

When extracting the detail fields I validate each one: if an element is missing or its text is empty, the field falls back to '暂无' ("not available"); the CVE number falls back to the CNNVD number instead. The fields are collected into a dict and then serialized to JSON for storage:

# Extract the details of one vulnerability and save them to a JSON file
def get_vulnerability_detail(url_now):
    req = requests.get(url + url_now, headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")

    def clean(text):
        # Strip tabs and line breaks from an extracted string.
        return text.replace('\t', '').replace('\n', '').replace('\r', '')

    info = {}
    detail = soup.find('div', class_='detail_xq w770')
    info["cve_name"] = detail.find('h2').text

    # CNNVD number, taken from the part after the full-width colon.
    span = detail.find('ul').find('span')
    if not hasattr(span, 'text') or span.text == '':
        info["cnnvd_no"] = '暂无'
    else:
        info["cnnvd_no"] = span.text.split(':')[1]

    # The remaining summary fields sit in the same <li> list. Each one falls back to
    # '暂无' when missing or empty; the CVE number falls back to the CNNVD number instead.
    message = detail.find('ul').find_all('li')
    fields = [
        (1, 'cnnvd_level', '暂无'),
        (2, 'cve_no', None),
        (3, 'catag', '暂无'),
        (4, 'start_time', '暂无'),
        (5, 'threat_cata', '暂无'),
        (6, 'update_time', '暂无'),
        (7, 'company', '暂无'),
        (8, 'from', '暂无'),
    ]
    for index, key, default in fields:
        if default is None:
            default = info["cnnvd_no"]
        item = message[index].a
        if not hasattr(item, 'text') or clean(item.text) == '':
            info[key] = default
        else:
            info[key] = clean(item.text)

    # Vulnerability description.
    paragraphs = soup.find('div', class_='d_ldjj').find_all('p')
    introduction = ''.join(clean(p.text) for p in paragraphs)
    info["introduction"] = introduction if introduction != '' else '暂无'

    # Bulletin, references, affected entities and patches each live in a
    # 'd_ldjj m_t_20' block; multiple entries are joined with '|'.
    sections = soup.find_all('div', class_='d_ldjj m_t_20')
    bulletin = '|'.join(clean(p.text) for p in sections[0].find_all('p'))
    info["bulletin"] = bulletin if bulletin != '' else '暂无'
    reference = '|'.join(clean(p.text) for p in sections[1].find_all('p'))
    info["reference"] = reference if reference != '' else '暂无'
    victim = '|'.join(clean(li.div.a.text) for li in sections[2].find_all('li'))
    info["victim"] = victim if victim != '' else '暂无'
    patch = '|'.join(clean(li.div.a.text) for li in sections[3].find_all('li'))
    info["patch"] = patch if patch != '' else '暂无'

    # Write one JSON file per vulnerability, named after its CVE number.
    jsonData = json.dumps(info, ensure_ascii=False)
    with open('./cve_json/' + str(info["cve_no"]) + '.json', 'w', encoding='utf-8') as fileObject:
        fileObject.write(jsonData)
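
Once the crawl has run, the per-vulnerability files can be loaded back for further processing. A minimal sketch, assuming the same ./cve_json directory used above:

# Minimal sketch: load every saved vulnerability record back into a list of dicts.
import glob
import json

records = []
for path in glob.glob('./cve_json/*.json'):
    with open(path, encoding='utf-8') as f:
        records.append(json.load(f))
print(len(records), 'records loaded')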

4. Main function

The main function loops over every page. When a page fails to load because of a network error, it is retried until it is crawled successfully.

# Main entry point
if __name__ == '__main__':
    init()
    get_all_page()
    for now_page in range(4219, all_page + 1):
        while True:
            try:
                print(now_page, end='\t')
                get_now_page_all_url('http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=' + str(now_page) + '&repairLd=')
                print('end', end='\t')
            except Exception:
                print('error', end='\t')
                time.sleep(3)
                continue
            break
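
The loop above starts at page 4219 rather than 1, presumably so the crawl can resume where a previous run stopped. A small optional variation (my suggestion, not part of the original script) takes the resume page from the command line instead of hard-coding it:

# Optional variation: read the resume page from the command line, e.g. `python crawler.py 4219`.
import sys

start_page = int(sys.argv[1]) if len(sys.argv) > 1 else 1
for now_page in range(start_page, all_page + 1):
    ...  # same retry loop as above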

 
