静态网页内容爬取(python)

以解析网站漏洞扫描报告(静态 HTML 页面)并将结果写入 MySQL 为例:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pymysql as MySQLdb
import re
import os
# Insert one vulnerability record into the database.
def insertData(lis):
    """Insert a single row into table ``xxx`` and commit it.

    Uses the module-level ``conn`` database connection. Failures are
    reported with ``print`` and the row is skipped (best-effort import);
    the pending transaction is rolled back instead of being committed.

    Args:
        lis: Sequence whose first 14 items are, in order, the values for
            base, dev_ip, dev_os, sys_version, plug_version, scan_start,
            scan_end, bug_name, bug_describe, bug_solution, bug_score,
            bug_dplug, bug_founddate and bug_cve.
    """
    insertSql = (
        'insert into xxx(base,dev_ip,dev_os,sys_version,plug_version,'
        'scan_start,scan_end,bug_name,bug_describe,bug_solution,bug_score,'
        'bug_dplug,bug_founddate,bug_cve) '
        'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    )
    cursor = conn.cursor()
    try:
        # Values are passed as parameters so the driver does the escaping.
        cursor.execute(insertSql, tuple(lis[:14]))
    except Exception as e:
        # Deliberately broad: one bad row must not abort the whole import.
        print(e)
        conn.rollback()
    else:
        conn.commit()
        print('--------------------------')
    finally:
        cursor.close()

def getFileNamelist(directory=r"filepath"):
    """Return the names, without extension, of the files in ``directory``.

    Only files located directly in ``directory`` are listed; files in
    sub-directories are ignored.

    Args:
        directory: Directory to list. Defaults to the ``"filepath"``
            placeholder used by the original script.

    Returns:
        A list of file names with their last extension removed, in the
        order reported by the operating system. Empty if the directory
        does not exist or cannot be read.
    """
    # Local import: the script's main section rebinds the global name
    # ``os`` to the scanned host's operating-system string.
    import os

    try:
        # The first item yielded by os.walk describes ``directory`` itself.
        _root, _dirs, files = next(os.walk(directory))
    except StopIteration:
        return []
    return [os.path.splitext(fileName)[0] for fileName in files]

# Collect the IP addresses listed in the report.
def getIp(url=r'filepath.html', table_index=7):
    """Return the first-column text of every odd/even row of one report table.

    The page at ``url`` is parsed and the ``report_table`` table at position
    ``table_index`` is read (presumably the host list of the scan report;
    verify against the actual report layout).

    Args:
        url: Location passed to ``urlopen``. Defaults to the
            ``'filepath.html'`` placeholder of the original script.
        table_index: Index of the table to read among all tables whose
            class is exactly ``report_table``. Defaults to 7.

    Returns:
        A list with the text of the first ``<td>`` of each row, rows of
        class ``odd`` first, then rows of class ``even``. Empty if the
        page has no table at ``table_index``.
    """
    # BeautifulSoup reads the whole response while constructing the tree,
    # so the response can be closed right afterwards.
    with urlopen(url) as html:
        page = BeautifulSoup(html, "html.parser")

    allReportTable = page.select('table[class = "report_table"]')
    if len(allReportTable) <= table_index:
        return []

    hostTable = allReportTable[table_index]
    rows = list(hostTable.select('tr[class = "odd"]'))
    rows += hostTable.select('tr[class = "even"]')

    ipList = []
    for row in rows:
        cells = row.select('td')
        if cells:  # skip rows that have no data cell
            ipList.append(cells[0].get_text())
    return ipList

#-------- Extract the vulnerability details of a single IP --------
# Fixed position of each detail field in the 6-item list consumed by
# getSingleBug, keyed by the header text shown in the report.
_DETAIL_SLOTS = {
    "详细描述": 0,
    "解决办法": 1,
    "威胁分值": 2,
    "危险插件": 3,
    "发现日期": 4,
    "CVE编号": 5,
}


def _scoreToLevel(scoreText):
    """Map a numeric threat score (as text) to the level label 高/中/低.

    Raises:
        ValueError: If ``scoreText`` is not a valid float.
    """
    scoreValue = float(scoreText)
    if 7.0 <= scoreValue <= 10.0:
        return "高"
    if 4.0 <= scoreValue < 7.0:
        return "中"
    return "低"


def _parseBugTable(table):
    """Return the 6 detail fields of one vulnerability table, '' if absent."""
    details = [''] * 6
    cells = table.select('td')
    for position, header in enumerate(table.select("th")):
        slot = _DETAIL_SLOTS.get(header.get_text())
        # The n-th <th> is paired with the n-th <td>; ignore headers that
        # are not imported or that have no matching cell.
        if slot is None or position >= len(cells):
            continue
        text = cells[position].get_text().strip()
        if slot in (0, 1):
            # Description and solution: remove all whitespace.
            text = re.sub(r"\s+", "", text)
        elif slot == 2:
            text = _scoreToLevel(text)
        # Store by field, not by header position, so a missing or extra
        # row in the report cannot shift values into the wrong column.
        details[slot] = text
    return details


def getDetilMess():
    """Parse the vulnerability details of the current page and store them.

    Reads the module-level ``soup`` document. If it contains an element
    with id ``vul_detail``, every ``report_table plumb`` table inside it is
    converted to a 6-item detail list, the lists are paired with the bug
    names returned by ``getBugName`` and each bug is passed to
    ``getSingleBug``. Otherwise a "no vulnerability" message is printed.
    """
    bugDetill = soup.find_all(id="vul_detail")
    # No detail section means this IP has no vulnerabilities.
    if not bugDetill:
        print("该IP无漏洞")
        return

    bugDetilList = bugDetill[0]
    listAll = [
        _parseBugTable(table)
        for table in bugDetilList.select('table[class = "report_table plumb"]')
    ]
    # NOTE(review): names come back grouped by severity while the tables
    # are in document order; pairing by index presumably relies on the
    # report already being sorted by severity -- confirm.
    bugNameList = getBugName(bugDetilList)
    bugDict = getBugNameDict(bugNameList, listAll)
    for bug in bugNameList:
        getSingleBug(bugDict, bug)

#----------- Collect the bug-name elements -------------
def getBugName(bugDetilList):
    """Return the bug-name ``<span>`` elements grouped by severity.

    Args:
        bugDetilList: Parsed element offering ``select(css_selector)``
            (the ``vul_detail`` section of the report).

    Returns:
        A list with all ``level_danger_high`` spans first, then the
        ``level_danger_middle`` spans, then the ``level_danger_low`` spans.
    """
    selectors = (
        'span[class = "level_danger_high"]',
        'span[class = "level_danger_middle"]',
        'span[class = "level_danger_low"]',
    )
    bugNameList = []
    # One query per level keeps the high/middle/low grouping of the result.
    for selector in selectors:
        bugNameList.extend(bugDetilList.select(selector))
    return bugNameList
#----------- Build the mapping from bug name to its details -------------
def getBugNameDict(bugNameList, listAll):
    """Pair each bug name with the detail list at the same index.

    Args:
        bugNameList: Elements offering ``get_text()``; the text is the key.
        listAll: Detail lists, expected in the same order as ``bugNameList``.

    Returns:
        A dict mapping bug-name text to its detail list. A name with no
        detail list at its index gets six empty strings, so a later lookup
        by name still succeeds. If a name occurs more than once, the last
        occurrence wins.
    """
    bugDict = dict()
    for index, bugName in enumerate(bugNameList):
        if index < len(listAll):
            details = listAll[index]
        else:
            # Fewer detail tables than names: keep the bug, blank details.
            details = [''] * 6
        bugDict[bugName.get_text()] = details
    return bugDict

#----------- Store a single bug -------------
def getSingleBug(bugDict, bug):
    """Assemble the 14-value record of one bug, print it and insert it.

    Reads the module-level values ``ip``, ``os``, ``osEd``, ``plugEd``,
    ``scanStart`` and ``scanEnd`` set by the main section for the host
    being processed. Note that ``os`` here is that global string (the
    host's operating system), not the ``os`` module.

    Args:
        bugDict: Mapping from bug-name text to its detail list, ordered as
            description, solution, score level, dangerous plugin, found
            date, CVE id.
        bug: Element offering ``get_text()`` that yields the bug name.

    Raises:
        KeyError: If the bug name is not a key of ``bugDict``.
    """
    bugName = bug.get_text()
    # Exactly six detail values: pad a short list, ignore any extras.
    bugContent = (list(bugDict[bugName]) + [''] * 6)[:6]
    singleIplist = [
        "name",
        ip,
        os,
        osEd,
        plugEd,
        scanStart,
        scanEnd,
        bugName,
    ] + bugContent
    print(singleIplist)
    insertData(singleIplist)

# Script entry point: import every host of the report into the database.
if __name__ == "__main__":
    conn = MySQLdb.connect("数据库连接信息")
    try:
        iplist = getIp()
        for _entry in iplist:
            # NOTE(review): every iteration opens the same 'filename'
            # placeholder; presumably the path should be derived from the
            # current IP -- confirm.
            # The names below are module globals read by getSingleBug.
            # ``os`` deliberately keeps its original name even though it
            # hides the imported ``os`` module from here on.
            ip = os = osEd = plugEd = scanStart = scanEnd = ''

            # ``soup`` is a module global read by getDetilMess. The parser
            # consumes the response on construction, so it can be closed.
            with urlopen(r'filename') as html:
                soup = BeautifulSoup(html, "html.parser")

            messDetil = soup.select('table[class = "report_table plumb"]')
            # The second such table holds the host summary; the n-th <th>
            # is paired with the n-th <td>.
            singleMessDetil = messDetil[1]
            cells = singleMessDetil.select('td')
            for x, singleTopMess in enumerate(singleMessDetil.select('th')):
                label = singleTopMess.get_text()
                if label == "IP地址":
                    ip = cells[x].get_text()
                elif label == "操作系统":
                    os = cells[x].get_text()
                elif label == "系统版本":
                    osEd = cells[x].get_text()
                elif label == "插件版本":
                    plugEd = cells[x].get_text()
                elif label == "扫描起始时间":
                    scanStart = cells[x].get_text()
                elif label == "扫描结束时间":
                    scanEnd = cells[x].get_text()

            getDetilMess()
    finally:
        # Close the connection even if parsing or inserting fails.
        conn.close()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值