Beautiful Soup 爬网页

最新推荐文章于 2024-02-14 20:50:12 发布

allen_fan_nj

最新推荐文章于 2024-02-14 20:50:12 发布

阅读量530

点赞数

分类专栏： python 网络爬虫文章标签： python 网络爬虫

本文链接：https://blog.csdn.net/fanwb1985/article/details/51834209

版权

python 同时被 2 个专栏收录

1 篇文章 0 订阅

订阅专栏

网络爬虫

1 篇文章 0 订阅

订阅专栏

Python 的 Beautiful Soup 是一款强大的 HTML 解析工具，堪称网络爬虫利器。

下面这段代码根据 cvelist.csv 文件中的 CVE ID，分别爬取每个 CVE 的信息，记录于此备查。

# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import time
import os
import copy
import random
from urllib2 import Request, urlopen, HTTPError
import logging
import json
from bs4 import BeautifulSoup 
import json
import codecs
import gevent
import logging


# Page template for one CVE entry; %s is replaced with the full "CVE-..." id.
URL = "http://cve.scap.org.cn/%s.html"


def fetchCVE(sid):
    """Download the HTML page describing one CVE entry.

    Args:
        sid: CVE identifier without the "CVE-" prefix; it is converted
            with str() and surrounding whitespace is stripped.

    Returns:
        The response body decoded as UTF-8, or an empty string when the
        request or the decoding fails. Failures are logged instead of
        raised so that one bad id does not abort a batch crawl.
    """
    sid = "CVE-" + str(sid).strip()
    request_url = URL % (sid,)
    request_settings = {
        'content-type': 'text/plain',
        'Accept-Encoding': 'deflate',
        # The header value must not repeat the "User-Agent:" header name.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    }
    req = Request(request_url, headers=request_settings)
    try:
        response = urlopen(req)
        try:
            return response.read().decode('utf8')
        finally:
            # Release the connection even when read()/decode() raises.
            response.close()
    except HTTPError as e:
        logging.warning("fetch %s failed: HTTP %s", request_url, e.code)
    except Exception as e:
        # Best-effort crawl: report the problem and fall through to "".
        logging.warning("fetch %s failed: %s", request_url, e)
    return ""

def fetchCVEByList(sidList,sidContentPair):
    """Fetch every CVE page in a batch and cache each one on disk.

    Args:
        sidList: iterable of CVE id strings (without the "CVE-" prefix);
            each entry is stripped before use.
        sidContentPair: dict that is updated in place, mapping the
            stripped id to the page text returned by fetchCVE.

    Side effects:
        Writes the UTF-8 encoded page to "./file/<id>" for every id,
        creating the "./file" directory first when it does not exist.
    """
    try:
        os.makedirs("./file")
    except OSError:
        # Already existing is fine; anything else is a real error.
        if not os.path.isdir("./file"):
            raise
    for raw_sid in sidList:
        sid = raw_sid.strip()
        content = fetchCVE(sid)
        sidContentPair[sid] = content
        # Binary mode: the payload is already encoded bytes, and the
        # handle is closed even if the write fails.
        with open("./file/" + sid, "wb") as fp:
            fp.write(content.encode("utf-8"))

def parseTD(table):
    """Parse an HTML fragment with the lxml parser and return all of its <td> tags."""
    return BeautifulSoup(table, "lxml").find_all("td")

def getScoreAndSeverity(table):
    """Extract two cell values from the given table markup.

    Returns a 2-tuple holding the ``.string`` of the third <td> followed
    by the ``.string`` of the second <td>, or ("", "") when the markup has
    fewer than three cells.

    NOTE(review): the caller visible in this file unpacks the result as
    (severity, score), i.e. the opposite order of what the function name
    suggests - confirm against the page layout.
    """
    cells = parseTD(table)
    if len(cells) <= 2:
        return ("", "")
    third_cell, second_cell = cells[2], cells[1]
    return (third_cell.string, second_cell.string)
def getPlatform(table):
    """Join the text of every <td> in the table markup.

    Cells whose ``.string`` is None are skipped; every remaining cell
    text is prefixed with a newline, so a non-empty result starts with
    "\\n". Returns "" when no cell contributes text.
    """
    pieces = []
    for cell in parseTD(table):
        text = cell.string
        if text is not None:
            pieces.append("\n" + text)
    return "".join(pieces)

def getSummary(summary):
    """Return the ``.string`` of the first <strong> tag in the markup.

    Raises IndexError when the markup contains no <strong> tag.
    """
    first_strong = BeautifulSoup(summary, "lxml").find_all("strong")[0]
    return first_strong.string

def writeCVEList(sidContentPair):
    """Parse the fetched CVE pages and dump the extracted fields as JSON.

    Args:
        sidContentPair: dict mapping a CVE id to the HTML text of its page.
            Entries whose content is "" are logged as errors and skipped.

    Side effects:
        Writes "result.json" in the current directory. It maps each
        successfully parsed id to [name, score, severity, platform].
        Pages that fail to parse are logged with their traceback and left
        out of the result.
    """
    # Report progress against the real number of entries.
    length = len(sidContentPair)
    logging.info("begin")
    sidInfoDic = {}
    for counter, sid in enumerate(sidContentPair, 1):
        content = sidContentPair.get(sid)
        if content == "":
            logging.error("sid:"+sid+ " content is none")
            continue
        try:
            soup = BeautifulSoup(content, "lxml")
            summary = soup.find_all("div", {'class': 'summary'})
            cvsstable = soup.find_all(id="cvss")
            cpetable = soup.find_all(id="cpe")
            (severity, score) = getScoreAndSeverity(cvsstable[0].encode("utf-8"))
            if severity == "" or score == "":
                logging.error("sid %s no score", sid)
            name = ""
            # The last direct child of the summary div whose markup
            # mentions "strong" supplies the name.
            for node in summary[0].contents:
                markup = node.encode("utf-8")
                if markup.find("strong") != -1:
                    name = getSummary(markup)
            platform = getPlatform(cpetable[0].encode("utf-8"))
            sidInfoDic[sid] = [name, score.strip(), severity, platform]
            logging.info("process %d of total %d rule: SUCCEED\n", counter, length)
        except Exception as e:
            # A malformed page must not abort the whole run.
            logging.exception(e)
            # Pass sid as an argument so a '%' in it cannot break formatting.
            logging.info("process %d of total %d rule: FAIL,sid=%s\n", counter, length, sid)
    # Encode explicitly as UTF-8 rather than relying on the process-wide
    # default encoding when non-ASCII text is written.
    with codecs.open("result.json", "w", encoding="utf-8") as fp:
        json.dump(sidInfoDic, fp, ensure_ascii=False, indent=4)

def dumpResult():
    """Crawl every CVE listed in cvelist.csv, then write result.json.

    Reads one CVE id per line from "cvelist.csv" (blank lines are
    ignored), splits the ids into batches of 500, and runs fetchCVEByList
    on each batch in its own greenlet. Once all greenlets have finished,
    the collected pages are handed to writeCVEList.

    NOTE(review): no gevent monkey patching is visible in this file;
    without it the blocking urllib2 calls presumably run one after another
    rather than concurrently - confirm.
    """
    sidContentPair = {}
    with open("cvelist.csv", 'r') as fp:
        # A blank line would become the id "" and the path "./file/".
        lines = [line for line in fp if line.strip()]
    taskPerThread = 500
    # One greenlet per non-empty batch; stepping by the batch size avoids
    # spawning workers for empty slices.
    threadList = [
        gevent.spawn(fetchCVEByList,
                     lines[taskBegin:taskBegin + taskPerThread],
                     sidContentPair)
        for taskBegin in range(0, len(lines), taskPerThread)
    ]
    gevent.joinall(threadList)
    writeCVEList(sidContentPair)

def dumpResultByFile():
    """Build result.json from pages already cached under ./file/.

    Reads one CVE id per line from "cvelist.csv" (blank lines are
    ignored), loads the cached page "./file/<id>" for each id, and passes
    the id -> content mapping to writeCVEList. A cache file that cannot
    be read is logged and recorded as empty content, so writeCVEList
    reports that id instead of the whole run aborting.
    """
    sidContentPair = {}
    # CVE list file: one CVE id per line.
    with open("cvelist.csv", 'r') as fp:
        sids = [line.strip() for line in fp if line.strip()]
    for sid in sids:
        try:
            with open("./file/" + sid, "r") as page:
                content = page.read()
        except IOError as e:
            logging.warning("cannot read cached page for %s: %s", sid, e)
            content = ""
        sidContentPair[sid] = content
    writeCVEList(sidContentPair)
    
if __name__ == '__main__':
    # Parse the pages already cached under ./file/. Call dumpResult()
    # here instead to crawl the site first.
    dumpResultByFile()

allen_fan_nj

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Beautiful Soup 爬网页

Python Beautiful SOAP 是一款强大的html解析工具，堪称网络爬虫利器。下面代码为工具cvelist.csv文件中的CVE ID，分别爬出该CVE信息的一段代码。供记录。# -*- coding: utf-8 -*-import sysreload(sys)sys.setdefaultencoding('utf-8')import timeimpor
复制链接

扫一扫

专栏目录