Python 3 web scraping with BeautifulSoup (find_all usage and more)

#read1.html file#

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body></html>

#!/usr/bin/env python
# -*- coding:UTF-8 -*-

import os
import re
from bs4 import NavigableString
from bs4 import BeautifulSoup

curpath = os.path.dirname(os.path.realpath(__file__))

htmlpath = os.path.join(curpath, 'read1.html')

# requests.get() cannot fetch a bare filesystem path, so read the local file directly
file = open(htmlpath, "r", encoding="utf-8")

soup = BeautifulSoup(file, features="html.parser")

# stripped_strings yields every text node with leading/trailing whitespace removed
for s in soup.stripped_strings:
    print(repr(s))
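A related convenience is get_text(), which joins every text node in a single call; a quick sketch:

# get_text() concatenates all text nodes; a separator and strip=True keep it readable
print(soup.get_text(" ", strip=True))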

# Walk upward from a single tag through its parents
link = soup.find(class_="sister")
for parent in link.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
print(link.next_sibling)

# Navigate around each tag in a result set
links = soup.find_all(class_="sister")
for link in links:
    print(link.next_element)
    print(link.next_sibling)
    print(link.previous_element)
    print(link.previous_sibling)

# Filter functions for find_all: return True for the tags/attribute values to keep
def has_class_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

def not_lacie(href):
    return href and not re.compile("lacie").search(href)

def not_tillie(href):
    return href and not re.compile("tillie").search(href)

def not_tillie1(id):
    return id and not re.compile("link2").search(id)
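These predicates plug straight into find_all; a quick sketch using two of them (the expected matches assume the read1.html shown above):

# tags that define class but not id: the three <p> tags
print(soup.find_all(has_class_no_id))

# <a> tags whose href does not contain "tillie": the Elsie and Lacie links
print(soup.find_all(href=not_tillie))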

file=open("soup.html","r",encoding="utf-8")

soup=BeautifulSoup(file,features="lxml")#find_all用法

# a regular expression matches tag names: every tag starting with "b" (<body>, <b>)
tags = soup.find_all(re.compile('^b'))

# a string matches one exact tag name
tags = soup.find_all('b')

# a list matches any of the listed names
tags = soup.find_all(['a', 'b'])

# a function receives each tag and keeps those for which it returns True
tags = soup.find_all(has_class_no_id)

# True matches every tag in the document
tags = soup.find_all(True)

# keyword arguments filter on attributes; a function value filters that attribute
tags = soup.find_all(href=not_lacie)
for tag in tags:
    print(tag.name)

def surrounded_by_strings(tag):
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))
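surrounded_by_strings is a tag-level filter like has_class_no_id, so it can be passed as the first argument; a brief sketch:

# tags whose neighbours in document order are both text nodes
for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)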

tags = soup.find_all(id=not_tillie1)
for tag in tags:
    print(tag)

# attrs takes a dict, useful for attributes that clash with Python keywords
tags = soup.find_all(attrs={"id": "link3"})
for tag in tags:
    print(tag)

# recursive=False searches direct children only; on the soup itself that is just <html>
soup.find_all(recursive=False)
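find_all also accepts limit and string keyword arguments; a short sketch (string= needs Beautiful Soup 4.4+, older versions spell it text=):

# stop after the first two matches
first_two = soup.find_all("a", limit=2)

# match text nodes rather than tags
elsie_text = soup.find_all(string=re.compile("Elsie"))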

tags=soup.select("body a")

tags=soup.select("p > a")

tags=soup.select("p > #link1")

tags=soup.select("html head title")

tags=soup.select(".sister")

tags=soup.select("[class~=sister]")

tags=soup.select("#link1 + .sister")

tags=soup.select("#link1")

tags=soup.select("a#link1")

tags=soup.select("a[href]")

tags=soup.select('a[href^="http://example"]')

tags=soup.select('a[href$="tillie"]')

tags=soup.select('a[href*=".com/el"]')for tag intags:print(tag)
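When only the first match matters, select_one() returns a single tag or None instead of a list; a small sketch:

# select_one: first matching tag, or None when nothing matches
first_sister = soup.select_one(".sister")
if first_sister is not None:
    print(first_sister["id"])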

file=open("soup.html","r",encoding="utf-8")

soup=BeautifulSoup(file,features="html.parser")

soup=BeautifulSoup(file,features="html.parser")print(soup.prettify())print(type(soup))print(type(soup.title))print(type(soup.title.string))print(type(soup.b.string))print(soup.head.name)print(soup.title.name)print(soup.a.name)print(soup.name)

# tag attribute access works like a dict
tag = soup.a
print(tag["href"])
print(tag.string)
print(tag["class"])
print(tag.attrs)
print(soup.title.string)
print(soup.title.name)
print(soup.p.attrs)
print(soup.a.attrs)
print(soup.a["class"])
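Indexing a missing attribute raises KeyError; tag.get() is the dict-style safe alternative:

# get() returns None (or a supplied default) instead of raising
print(tag.get("href"))
print(tag.get("data-missing", "no such attribute"))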
