Python 3 web scraping with BeautifulSoup (find_all usage and more)

#read1.html file#

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body></html>

#!/usr/bin/env python
# -*- coding:UTF-8 -*-

import os
import re
from bs4 import NavigableString
from bs4 import BeautifulSoup

curpath = os.path.dirname(os.path.realpath(__file__))

htmlpath = os.path.join(curpath, 'read1.html')

# requests.get() cannot fetch a bare filesystem path, so read the local file directly
file = open(htmlpath, "r", encoding="utf-8")

soup = BeautifulSoup(file, features="html.parser")

# stripped_strings yields every text node with leading/trailing whitespace removed
for s in soup.stripped_strings:
    print(repr(s))
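A related convenience is get_text(), which joins every text node in a single call; a quick sketch:

# get_text() concatenates all text nodes; a separator and strip=True keep it readable
print(soup.get_text(" ", strip=True))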

# Walk upward from a single tag through its parents
link = soup.find(class_="sister")
for parent in link.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
print(link.next_sibling)

# Navigate around each tag in a result set
links = soup.find_all(class_="sister")
for link in links:
    print(link.next_element)
    print(link.next_sibling)
    print(link.previous_element)
    print(link.previous_sibling)

# Filter functions for find_all: return True for the tags/attribute values to keep
def has_class_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

def not_lacie(href):
    return href and not re.compile("lacie").search(href)

def not_tillie(href):
    return href and not re.compile("tillie").search(href)

def not_tillie1(id):
    return id and not re.compile("link2").search(id)
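These predicates plug straight into find_all; a quick sketch using two of them (the expected matches assume the read1.html shown above):

# tags that define class but not id: the three <p> tags
print(soup.find_all(has_class_no_id))

# <a> tags whose href does not contain "tillie": the Elsie and Lacie links
print(soup.find_all(href=not_tillie))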

file=open("soup.html","r",encoding="utf-8")

soup=BeautifulSoup(file,features="lxml")#find_all用法

# a regular expression matches tag names: every tag starting with "b" (<body>, <b>)
tags = soup.find_all(re.compile('^b'))

# a string matches one exact tag name
tags = soup.find_all('b')

# a list matches any of the listed names
tags = soup.find_all(['a', 'b'])

# a function receives each tag and keeps those for which it returns True
tags = soup.find_all(has_class_no_id)

# True matches every tag in the document
tags = soup.find_all(True)

# keyword arguments filter on attributes; a function value filters that attribute
tags = soup.find_all(href=not_lacie)
for tag in tags:
    print(tag.name)

def surrounded_by_strings(tag):
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))
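surrounded_by_strings is a tag-level filter like has_class_no_id, so it can be passed as the first argument; a brief sketch:

# tags whose neighbours in document order are both text nodes
for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)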

tags = soup.find_all(id=not_tillie1)
for tag in tags:
    print(tag)

# attrs takes a dict, useful for attributes that clash with Python keywords
tags = soup.find_all(attrs={"id": "link3"})
for tag in tags:
    print(tag)

# recursive=False searches direct children only; on the soup itself that is just <html>
soup.find_all(recursive=False)
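find_all also accepts limit and string keyword arguments; a short sketch (string= needs Beautiful Soup 4.4+, older versions spell it text=):

# stop after the first two matches
first_two = soup.find_all("a", limit=2)

# match text nodes rather than tags
elsie_text = soup.find_all(string=re.compile("Elsie"))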

tags=soup.select("body a")

tags=soup.select("p > a")

tags=soup.select("p > #link1")

tags=soup.select("html head title")

tags=soup.select(".sister")

tags=soup.select("[class~=sister]")

tags=soup.select("#link1 + .sister")

tags=soup.select("#link1")

tags=soup.select("a#link1")

tags=soup.select("a[href]")

tags=soup.select('a[href^="http://example"]')

tags=soup.select('a[href$="tillie"]')

tags=soup.select('a[href*=".com/el"]')for tag intags:print(tag)
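When only the first match matters, select_one() returns a single tag or None instead of a list; a small sketch:

# select_one: first matching tag, or None when nothing matches
first_sister = soup.select_one(".sister")
if first_sister is not None:
    print(first_sister["id"])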

file=open("soup.html","r",encoding="utf-8")

soup=BeautifulSoup(file,features="html.parser")

soup=BeautifulSoup(file,features="html.parser")print(soup.prettify())print(type(soup))print(type(soup.title))print(type(soup.title.string))print(type(soup.b.string))print(soup.head.name)print(soup.title.name)print(soup.a.name)print(soup.name)

# tag attribute access works like a dict
tag = soup.a
print(tag["href"])
print(tag.string)
print(tag["class"])
print(tag.attrs)
print(soup.title.string)
print(soup.title.name)
print(soup.p.attrs)
print(soup.a.attrs)
print(soup.a["class"])
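Indexing a missing attribute raises KeyError; tag.get() is the dict-style safe alternative:

# get() returns None (or a supplied default) instead of raising
print(tag.get("href"))
print(tag.get("data-missing", "no such attribute"))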
