BeautifulSoup编写脚本实现内网网页链接爬取

#!/usr/bin/env python3

# -*- coding:utf-8 -*-

"""

author: xiaohong.d

date: 2020-04-30

description: this Python 3 script is used by a Groovy script, which passes parameters to filter the engine tag list of one module.

From now on, this script needs the Python 3 modules requests, json and BeautifulSoup, and runs only on the slave node.

"""

import getopt
import json
import sys

import requests
from bs4 import BeautifulSoup

 

class RequestsParam:
    """Bundle the request path and timeout used for one HTTP request.

    The path is built by plain string concatenation of the four URL pieces,
    so each piece must carry its own separators (for example a trailing
    ``/``); nothing is inserted or normalised between them.
    """

    def __init__(self, url, prefix, module, suffix, timeout):
        """Store the concatenated request path and the timeout.

        Args:
            url: First piece of the path (the base URL).
            prefix: Piece appended directly after ``url``.
            module: Piece appended directly after ``prefix``.
            suffix: Piece appended directly after ``module``.
            timeout: Stored unchanged as ``self.timeout``.
        """
        self.path = url + prefix + module + suffix
        self.timeout = timeout

 

class Retrieve:
    """Fetch the content of the page described by a request-parameter object."""

    def __init__(self, requestsparam):
        """Remember the request parameters.

        Args:
            requestsparam: Object exposing ``path`` and ``timeout``
                attributes; both are read by :meth:`getPage`.
        """
        self.req = requestsparam

    def getPage(self):
        """Issue a GET request for ``self.req.path`` and return the body text.

        ``self.req.timeout`` is passed to ``requests.get`` as its ``timeout``
        argument. The HTTP status code is not checked, so the body of an
        error response is returned like any other. Exceptions raised by
        ``requests.get`` are not caught here.

        Returns:
            The ``text`` attribute of the response.
        """
        response = requests.get(self.req.path, timeout=self.req.timeout)
        return response.text

 

class ParsingHTML:
    """Parse returned HTML and extract the text of its links."""

    def __init__(self, content):
        """Keep the HTML markup to be parsed.

        Args:
            content: HTML markup handed to BeautifulSoup by
                :meth:`parseVersionLi`.
        """
        self.content = content

    def parseVersionLi(self):
        """Return the link texts of the page with every ``/`` removed.

        The first ``<a>`` element is always dropped.
        NOTE(review): presumably that is the parent-directory entry of a
        directory listing -- confirm against the server output.

        Anchors whose ``.string`` is ``None`` (no single string child, e.g.
        an empty anchor or one with several children) are skipped; calling
        ``.replace`` on them would raise ``AttributeError``.

        Returns:
            A list of strings, one per remaining anchor, in document order.
        """
        soup = BeautifulSoup(self.content, 'lxml')
        # Slice before filtering so the dropped element is always the first
        # anchor of the document, regardless of which anchors are skipped.
        anchors = soup.find_all("a")[1:]
        return [
            anchor.string.replace("/", "")
            for anchor in anchors
            if anchor.string is not None
        ]

if __name__ == '__main__':
    # Command line: -a <module> selects the module path segment,
    # -h prints the help text.
    try:
        opts, _ = getopt.getopt(sys.argv[1:], "ha:")
    except getopt.GetoptError as ex:
        # Invalid option or missing argument: report it and exit with a
        # non-zero status so the caller can detect the failure.
        print(ex)
        sys.exit(2)

    module = "shield"  # default module when -a is not given
    for opt, value in opts:
        if opt == "-a":
            module = value
        if opt == "-h":
            print("this is a help message")
            # Asking for help should not trigger a network request.
            sys.exit(0)

    # Fetch the listing page for the module and print one link text per line.
    rp = RequestsParam('http://*****:9999/', 'box/', module + '/', 'ine/', 0.300)
    retrieve = Retrieve(rp)
    page = retrieve.getPage()
    ph = ParsingHTML(page)
    for tag in ph.parseVersionLi():
        print(tag)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值