先贴链接,让各位观众老爷看看对不对你们的胃口
可以看到,这个网页是静态 HTML 页面,所以问题变得非常简单
只需要用 requests 请求网页就可以了
话不多说,直接贴代码
import requests
from urllib.parse import urlencode
from lxml import etree
import pymysql
import time
import xlwt
import xlrd
def makeurl():
    """Generate the listing-page URLs, one per page.

    Yields one URL for each page index in ``range(MAX_PAGE)``. Every URL
    carries the fixed ``tdsourcetag`` and ``max=20`` query parameters plus
    an ``offset`` equal to the page index times 20. Each URL is printed
    before it is yielded.
    """
    # Example: http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    endpoint = 'http://ics.cnvd.org.cn/?'
    fixed_query = {'tdsourcetag': 's_pctim_aiomsg', 'max': '20'}
    # Step through the offsets directly: 0, 20, 40, ... for MAX_PAGE pages.
    for offset in range(0, MAX_PAGE * 20, 20):
        query = dict(fixed_query, offset=offset)
        url = endpoint + urlencode(query)
        print('url is ', url)
        yield url
def get_page_urllist(url, timeout=10):
    """Fetch one listing page and return its body as text.

    Args:
        url: URL of the listing page to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the caller indefinitely.

    Returns:
        The response body as a ``str`` (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    # Browser-like headers sent with every listing request.
    headers = {
        'Host': 'ics.cnvd.org.cn',
        'Referer': 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=40',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def parse_urllist(content):
    """Yield the first link found in each row of the listing table.

    Args:
        content: HTML text of a listing page.

    Yields:
        For every ``<tr>`` under ``<tbody id="tr">``, the ``href`` of the
        first ``td/a`` element in that row. Rows that contain no such link
        are skipped instead of aborting the whole iteration with an
        ``IndexError``.
    """
    if not content:
        # Nothing to parse; do not hand an empty document to the parser.
        return
    html = etree.HTML(content)
    for row in html.xpath('//tbody[@id="tr"]/tr'):
        hrefs = row.xpath('td/a/@href')
        if not hrefs:
            # Row without a link: nothing to yield for it.
            continue
        yield hrefs[0]
def get_page(url):
headers = {
'Host': '