Python BeautifulSoup 是一款强大的 HTML 解析工具,堪称网络爬虫利器。
下面代码读取 cvelist.csv 文件中的 CVE ID,分别爬取该 CVE 信息。此处留作记录。
# -*- coding: utf-8 -*-
import sys
reload(sys)                      # Python 2 only: re-expose setdefaultencoding
sys.setdefaultencoding('utf-8')  # NOTE(review): global encoding hack, kept for compatibility

# Standard library (json/logging were previously imported twice; deduplicated).
import codecs
import copy
import json
import logging
import os
import random
import time
from urllib2 import Request, urlopen, HTTPError

# Third-party
import gevent
from bs4 import BeautifulSoup

# Detail-page URL template on cve.scap.org.cn, filled with a "CVE-YYYY-NNNN" id.
URL = "http://cve.scap.org.cn/%s.html"
def fetchCVE(sid):
    """Download the HTML detail page for one CVE.

    sid: CVE number without the "CVE-" prefix (e.g. "2015-1234");
         surrounding whitespace is stripped.
    Returns the decoded page content, or "" when the request fails.
    """
    sid = "CVE-" + str(sid).strip()
    request_url = URL % (sid,)
    # Browser-like headers.  The original header value mistakenly began with
    # a literal "User-Agent:" prefix, sending a malformed UA string.
    request_settings = {
        'content-type': 'text/plain',
        'Accept-Encoding': 'deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/44.0.2403.157 Safari/537.36',
    }
    req = Request(request_url, headers=request_settings)
    content = ""
    try:
        response = urlopen(req)
        content = response.read().decode('utf8')
    except HTTPError as e:
        # Best effort: log instead of silently swallowing, still return "".
        logging.warning("HTTP error fetching %s: %s", sid, e)
    except Exception as e:
        logging.warning("error fetching %s: %s", sid, e)
    return content
def fetchCVEByList(sidList, sidContentPair):
    """Fetch every CVE id in sidList and record each page's content.

    Results are stored in the shared dict sidContentPair (keyed by the
    stripped id) and also cached on disk under ./file/<id> so later runs
    can reparse without re-fetching.
    """
    length = len(sidList)
    for count, sid in enumerate(sidList, 1):
        # The original built this progress string but never emitted it.
        logging.info("Process %d of %d", count, length)
        content = fetchCVE(sid)
        key = sid.strip()
        sidContentPair[key] = content
        # codecs.open handles the UTF-8 encoding; `with` guarantees close.
        with codecs.open("./file/" + key, "w", "utf-8") as fp:
            fp.write(content)
def parseTD(table):
    """Return all <td> elements found in the given HTML fragment."""
    return BeautifulSoup(table, "lxml").find_all("td")
def getScoreAndSeverity(table):
    """Extract two cell strings from a CVSS table fragment.

    Returns (third cell, second cell) — unpacked by the caller as
    (severity, score) — or ("", "") when the table has fewer than 3 cells.
    """
    cells = parseTD(table)
    if len(cells) <= 2:
        return ("", "")
    return (cells[2].string, cells[1].string)
def getPlatform(table):
    """Concatenate the text of every non-empty <td> in a CPE table fragment.

    Each value is preceded by a newline, matching the original output format.
    """
    tds = parseTD(table)
    # str.join avoids the quadratic cost of repeated string concatenation.
    return "".join("\n" + td.string for td in tds if td.string is not None)
def getSummary(summary):
    """Return the text of the first <strong> element in the HTML fragment.

    Raises IndexError when no <strong> is present (caught by the caller's
    surrounding try/except).
    """
    soup = BeautifulSoup(summary, "lxml")
    strong_tags = soup.find_all("strong")
    return strong_tags[0].string
def writeCVEList(sidContentPair):
    """Parse each cached CVE page and dump a summary dict to result.json.

    sidContentPair maps CVE id -> raw HTML page.  For every parseable page,
    a record [name, score, severity, platform] is stored under its id, and
    the whole mapping is written to result.json as UTF-8 JSON.
    """
    length = len(sidContentPair)  # was hard-coded to 4152
    counter = 0
    logging.info("begin")
    sidInfoDic = {}
    for sid in sidContentPair.keys():
        counter += 1
        content = sidContentPair.get(sid)
        if content == "":
            logging.error("sid:" + sid + " content is none")
            continue
        try:
            soup = BeautifulSoup(content, "lxml")
            summary = soup.find_all("div", {'class': 'summary'})
            cvsstable = soup.find_all(id="cvss")
            cpetable = soup.find_all(id="cpe")
            (severity, score) = getScoreAndSeverity(cvsstable[0].encode("utf-8"))
            if severity == "" or score == "":
                logging.error("sid %s no score", sid)
            name = ""
            # Distinct loop variable: the original reused "content",
            # clobbering the page text mid-iteration.
            for child in summary[0].contents:
                if child.encode("utf-8").find("strong") != -1:
                    name = getSummary(child.encode("utf-8"))
            platform = getPlatform(cpetable[0].encode("utf-8"))
            sidInfoDic[sid] = [name, score.strip(), severity, platform]
            logging.info("process %d of total %d rule: SUCCEED", counter, length)
        except Exception as e:
            logging.exception(e)
            logging.info("process %d of total %d rule: FAIL,sid=%s", counter, length, sid)
    # codecs.open guarantees UTF-8 output: with ensure_ascii=False, json emits
    # non-ASCII characters that a plain byte-mode file could choke on in Py2.
    with codecs.open("result.json", "w", "utf-8") as fp:
        json.dump(sidInfoDic, fp, ensure_ascii=False, indent=4)
def dumpResult():
    """Fetch every CVE listed in cvelist.csv concurrently, then write result.json.

    cvelist.csv holds one CVE number per line.  The work is split into chunks
    of 500 ids, one gevent greenlet per chunk.
    """
    sidContentPair = {}
    with open("cvelist.csv", 'r') as fp:
        lines = fp.readlines()
    length = len(lines)
    taskPerThread = 500
    # Ceiling division spawns exactly enough greenlets to cover every line;
    # the original loop (length/500 + 2 iterations) created extra empty ones.
    threadNumber = (length + taskPerThread - 1) // taskPerThread
    threadList = []
    for i in xrange(threadNumber):
        taskBegin = i * taskPerThread
        taskEnd = min((i + 1) * taskPerThread, length)
        t = gevent.spawn(fetchCVEByList, lines[taskBegin:taskEnd], sidContentPair)
        threadList.append(t)
    gevent.joinall(threadList)
    writeCVEList(sidContentPair)
def dumpResultByFile():
    """Rebuild result.json from pages previously cached under ./file/.

    cvelist.csv holds one CVE id per line; each id names a cached page file
    written by fetchCVEByList.
    """
    sidContentPair = {}
    with open("cvelist.csv", 'r') as fp:
        lines = fp.readlines()
    for line in lines:
        sid = line.strip()
        with open("./file/" + sid, "r") as fp:
            sidContentPair[sid] = fp.read()
    writeCVEList(sidContentPair)
# Entry point: by default reparse the locally cached pages; dumpResult()
# re-crawls everything over the network instead.
if __name__=='__main__':
    #dumpResult()
    dumpResultByFile()