Scraping Industrial Control System Vulnerabilities with Python

First, here is the site, so you can judge for yourselves whether it is to your taste: http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0

[Screenshot of the CNVD industrial control system vulnerability list page]

As you can see, the page is plain static HTML, so the problem becomes very simple.

All we need to do is fetch each page with the requests library.
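As a quick sanity check, here is a minimal sketch of that idea (the URL and User-Agent are the same ones used in the full script below; nothing else is assumed). It simply fetches the first list page and prints part of the returned HTML:

```
import requests

# First page of the ICS vulnerability list; offset advances by 20 per page
url = 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0'
headers = {
    'Host': 'ics.cnvd.org.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

response = requests.get(url, headers=headers)
print(response.status_code)   # expect 200 if the site is reachable
print(response.text[:300])    # beginning of the static HTML of the list page
```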

Without further ado, here is the full code.

```
import requests
from urllib.parse import urlencode
from lxml import etree
import pymysql  # imported in the original script but not actually used below
import time
import xlwt
import xlrd


def makeurl():
    """Generate the list-page URLs, 20 entries per page."""
    # http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    baseurl = 'http://ics.cnvd.org.cn/?'
    params = {
        'tdsourcetag': 's_pctim_aiomsg',
        'max': '20'
    }
    for page in range(MAX_PAGE):
        params['offset'] = page * 20
        url = baseurl + urlencode(params)
        print('url is ', url)
        yield url


def get_page_urllist(url):
    """Fetch one list page and return its HTML."""
    headers = {
        'Host': 'ics.cnvd.org.cn',
        'Referer': 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=40',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text


def parse_urllist(content):
    """Yield the detail-page links found on a list page."""
    html = etree.HTML(content)
    for li in html.xpath('//tbody[@id="tr"]/tr'):
        yield li.xpath('td/a/@href')[0]


def get_page(url):
    """Fetch a vulnerability detail page and return its HTML."""
    headers = {
        'Host': 'www.cnvd.org.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text


def parse_page(content, url):
    """Extract every field of the detail page into a dict."""
    html = etree.HTML(content)
    item = {}
    item['url'] = url
    item['标题'] = str(html.xpath('//div[@class="blkContainerSblk"]/h1/text()')[0])
    item['CNVD_ID'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="CNVD-ID"]/following-sibling::*[1]//text()')])
    item['公开日期'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="公开日期"]/following-sibling::*[1]//text()')])
    item['危害级别'] = ''.join(
        [i.strip().replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '') for i in
         html.xpath('//tbody/tr/td[text()="危害级别"]/following-sibling::*[1]//text()')])
    item['影响产品'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="影响产品"]/following-sibling::*[1]//text()')])
    try:
        item['BUGTRAQ_ID'] = ''.join(
            [i.strip() for i in html.xpath('//tbody/tr/td[text()="BUGTRAQ ID"]/following-sibling::*[1]//text()')])
    except Exception:
        item['BUGTRAQ_ID'] = ''
    item['CVE_ID'] = ''.join(
        [i.strip() for i in
         html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//text()')]) + ' ' + ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//@href')])
    item['漏洞描述'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞描述"]/following-sibling::*[1]//text()')])
    item['漏洞类型'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞类型"]/following-sibling::*[1]//text()')])
    item['参考链接'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="参考链接"]/following-sibling::*[1]//text()')])
    item['漏洞解决方案'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞解决方案"]/following-sibling::*[1]//text()')])
    item['厂商补丁'] = ''.join(
        [i.strip() for i in html.xpath(
            '//tbody/tr/td[text()="厂商补丁"]/following-sibling::*[1]//text()')]) + ' http://www.cnvd.org.cn' + ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="厂商补丁"]/following-sibling::*[1]//@href')])
    item['验证信息'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="验证信息"]/following-sibling::*[1]//text()')])
    item['报送时间'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="报送时间"]/following-sibling::*[1]//text()')])
    item['收录时间'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="收录时间"]/following-sibling::*[1]//text()')])
    item['更新时间'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="更新时间"]/following-sibling::*[1]//text()')])
    item['漏洞附件'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞附件"]/following-sibling::*[1]//text()')])
    return item
```
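Everything in parse_page relies on a single XPath pattern: locate the <td> whose text equals the field label, then take its first following sibling, which holds the field's value. A stripped-down sketch of that pattern on a hypothetical, simplified table fragment (the real CNVD markup is more involved) looks like this:

```
from lxml import etree

# Hypothetical, simplified fragment of a detail-page table, only to show the pattern
snippet = '''
<table><tbody>
  <tr><td>CNVD-ID</td><td>CNVD-2019-00001</td></tr>
  <tr><td>危害级别</td><td> 高 </td></tr>
</tbody></table>
'''

html = etree.HTML(snippet)

# label cell -> first following sibling -> all of its text nodes
cnvd_id = ''.join(i.strip() for i in html.xpath(
    '//tbody/tr/td[text()="CNVD-ID"]/following-sibling::*[1]//text()'))
level = ''.join(i.strip() for i in html.xpath(
    '//tbody/tr/td[text()="危害级别"]/following-sibling::*[1]//text()'))

print(cnvd_id)  # CNVD-2019-00001
print(level)    # 高
```

The rest of the script, the Excel helpers and the main loop, continues below.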

```
def save_data(index, item, workbook):
    """Write one vulnerability item into row `index` of the worksheet."""
    sheet = workbook.get_sheet('sheet1')  # reuse the existing worksheet
    for col, value in enumerate(item.values()):
        sheet.write(index, col, value)
    workbook.save(filename)
    print('保存成功')


def excel_prepare(heads):
    """Create a new workbook and write the header row."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)  # create the worksheet
    for col, value in enumerate(heads):
        sheet.write(0, col, value)
    return workbook


def urlisexist(url, urlset):
    """Return True if this detail URL has already been saved."""
    if url in urlset:
        return True
    else:
        return False


def getallurl(filename):
    """Read back all URLs already stored in column 0 (skipping the header row)."""
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    results = sheet1.col_values(0, 1)
    return results


def read_old(filename):
    """Load every existing row so it can be copied into the new workbook."""
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    alloldset = []
    for index in range(sheet1.nrows):
        alloldset.append(sheet1.row_values(index))
    return alloldset, sheet1.nrows


def save_old(index, olditem):
    """Copy one previously saved row into the new workbook."""
    sheet = workbook.get_sheet('sheet1')  # reuse the existing worksheet
    for col, value in enumerate(olditem):
        sheet.write(index, col, value)
    workbook.save(filename)


if __name__ == '__main__':
    # http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    TIMESLEEP = 0  # delay (seconds) between detail-page requests
    filename = '工程控制系统漏洞.xls'
    MAX_PAGE = 96  # number of list pages to walk through
    heads = ['url', '标题', 'CNVD_ID', '公开日期', '危害级别', '影响产品',
             'BUGTRAQ_ID', 'CVE_ID', '漏洞描述', '漏洞类型', '参考链接',
             '漏洞解决方案', '厂商补丁', '验证信息', '报送时间', '收录时间',
             '更新时间', '漏洞附件']

    # Reload whatever an earlier run already saved (if the file exists at all)
    try:
        alloldset, length = read_old(filename)
    except Exception:
        alloldset = []
        length = 1

    workbook = excel_prepare(heads)
    for index, olditem in enumerate(alloldset):
        save_old(index, olditem)

    try:
        urlset = getallurl(filename)
    except Exception:
        urlset = []

    index = length
    for urlofpage in makeurl():
        pagelistcontent = get_page_urllist(urlofpage)
        for url in parse_urllist(pagelistcontent):
            print('url is >>>', url)
            if not urlisexist(url, urlset):
                time.sleep(TIMESLEEP)
                result = get_page(url)
                item = parse_page(result, url)
                print('item is >>>', item)
                save_data(index, item, workbook)
                index = index + 1
    workbook.save(filename)
```
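A note on how the script resumes between runs: it re-reads any existing 工程控制系统漏洞.xls with xlrd, copies the old rows into a fresh xlwt workbook (xlwt can only write a whole file, it cannot append to one), and then skips every detail URL already present in column 0. Assuming a file from a previous run exists, you can peek at what has been collected so far with a short xlrd sketch:

```
import xlrd

filename = '工程控制系统漏洞.xls'   # the workbook produced by the scraper above

workbook = xlrd.open_workbook(filename)
sheet = workbook.sheet_by_name('sheet1')

saved_urls = sheet.col_values(0, 1)   # column 0 (url), skipping the header row
print('rows saved so far:', sheet.nrows - 1)
print('first saved url  :', saved_urls[0] if saved_urls else 'none yet')
```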

If anything is unclear, ask in the comments below.
