# Contents of the read1.html file:
The Dormouse's story##The Dormouse's story
##
Once upon a time there were three little sisters; and their names were#Elsie,#Lacie and#Tillie;#and they lived at the bottom of a well.
##
...
#!/usr/bin/env python
# -*- coding:UTF-8 -*-
import os
import re

import requests
from bs4 import NavigableString
from bs4 import BeautifulSoup
# Locate read1.html in the same directory as this script.
curpath = os.path.dirname(os.path.realpath(__file__))
hmtlpath = os.path.join(curpath, 'read1.html')

# read1.html is a local file, not a URL. requests.get() only accepts URLs that
# carry a scheme such as http:// and rejects a bare filesystem path, so the
# file is opened directly. Binary mode hands BeautifulSoup raw bytes, as the
# original response body did.
with open(hmtlpath, "rb") as html_file:
    soup = BeautifulSoup(html_file, features="html.parser")

# Print each non-blank text node of the document with its surrounding
# whitespace stripped.
for text in soup.stripped_strings:
    print(repr(text))
# Navigation demo on the elements that carry class="sister".
links = soup.find_all(class_="sister")

# find_all() returns a list-like ResultSet. Attributes such as .parents and
# .next_sibling belong to individual elements, so they are read from the first
# match, and only when there is at least one match.
if links:
    first_link = links[0]
    for parent in first_link.parents:
        print(parent.name)
    print(first_link.next_sibling)

# For every match, show its neighbours in parse order (*_element) and among
# the children of the same parent (*_sibling).
for link in links:
    print(link.next_element)
    print(link.next_sibling)
    print(link.previous_element)
    print(link.previous_sibling)


def has_class_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id`` attribute."""
    return tag.has_attr('class') and not tag.has_attr('id')


def not_lacie(href):
    """Accept an ``href`` value that does not contain "lacie".

    A missing or empty *href* is returned unchanged (a falsy value); otherwise
    the result is True when "lacie" does not occur in *href* and False when it
    does.
    """
    return href and not re.search("lacie", href)


def not_tillie(href):
    """Accept an ``href`` value that does not contain "tillie".

    A missing or empty *href* is returned unchanged (a falsy value); otherwise
    the result is True when "tillie" does not occur in *href* and False when it
    does.
    """
    return href and not re.search("tillie", href)


def not_tillie1(id):
    """Accept an ``id`` value that does not contain "link2".

    A missing or empty *id* is returned unchanged (a falsy value); otherwise
    the result is True when "link2" does not occur in *id* and False when it
    does.
    """
    # NOTE: the parameter shadows the builtin ``id``; the name is kept so the
    # function's signature stays unchanged.
    return id and not re.search("link2", id)
# find_all usage


def surrounded_by_strings(tag):
    """Return True when both neighbours of *tag* in parse order are strings.

    The neighbours are ``tag.next_element`` and ``tag.previous_element``; each
    must be a ``NavigableString``.
    """
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))


# Parse soup.html with the lxml parser. The with-block closes the file handle
# once the document has been read.
with open("soup.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, features="lxml")

# Each call below demonstrates one kind of filter. Only the result of the
# last assignment before a loop is printed.
tags = soup.find_all(re.compile('^b'))   # tag name matched by a regex
tags = soup.find_all('b')                # exact tag name
tags = soup.find_all(['a', 'b'])         # any tag name in the list
tags = soup.find_all(has_class_no_id)    # predicate called with each tag
tags = soup.find_all(True)               # every tag
tags = soup.find_all(href=not_lacie)     # predicate on the href value
for tag in tags:
    print(tag.name)

# Predicate on the id attribute value.
tags = soup.find_all(id=not_tillie1)
for tag in tags:
    print(tag)

# Attribute filter given as a dictionary.
tags = soup.find_all(attrs={"id": "link3"})
for tag in tags:
    print(tag)

# Restrict the search to direct children; the result is not used.
soup.find_all(recursive=False)
# CSS selector demos. The selectors are evaluated one after another in the
# order listed; `tags` ends up holding the result of the last one, which is
# the only result printed.
css_selectors = (
    "body a",                       # <a> anywhere below <body>
    "p > a",                        # <a> that is a direct child of <p>
    "p > #link1",                   # id "link1" directly inside <p>
    "html head title",              # <title> below <head> below <html>
    ".sister",                      # by CSS class
    "[class~=sister]",              # class list contains the word "sister"
    "#link1 + .sister",             # .sister immediately following #link1
    "#link1",                       # by id
    "a#link1",                      # <a> with id "link1"
    "a[href]",                      # <a> that has an href attribute
    'a[href^="http://example"]',    # href starts with the given text
    'a[href$="tillie"]',            # href ends with the given text
    'a[href*=".com/el"]',           # href contains the given text
)
for css_selector in css_selectors:
    tags = soup.select(css_selector)

for tag in tags:
    print(tag)
# Parse soup.html once with the built-in html.parser. Building a second
# BeautifulSoup from the same handle would read an already exhausted file and
# produce an empty document, so the document is parsed a single time and the
# handle is closed by the with-block.
with open("soup.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, features="html.parser")

# Pretty-printed markup of the whole document.
print(soup.prettify())

# Types of the main objects: the document, a tag, and the strings inside tags.
print(type(soup))
print(type(soup.title))
print(type(soup.title.string))
print(type(soup.b.string))

# Names of several tags and of the document object itself.
print(soup.head.name)
print(soup.title.name)
print(soup.a.name)
print(soup.name)

# Attribute access on the first <a> tag of the document.
tag = soup.a
print(tag["href"])
print(tag.string)
print(tag["class"])
print(tag.attrs)

# The same kinds of lookups made directly through the document object.
print(soup.title.string)
print(soup.title.name)
print(soup.p.attrs)
print(soup.a.attrs)
print(soup.a["class"])