html解析方法
1.正则表达式
import re
# --- Example 1: find the first occurrence of a pattern -----------------
text1 = 'abcdefg'
pattern1 = re.compile(r'c')
# Calling search() on the compiled pattern is equivalent to
# re.search(pattern1, text1).
matcher1 = pattern1.search(text1)
# group(0) is the whole matched substring.
print(matcher1.group(0))

# --- Example 2: capture text that spans several lines -------------------
text2 = """
<html>
aaa<h1>aa
bbb
</h1>
aaa
</html>
"""
# DOTALL lets "." match newlines too, so the non-greedy group can capture
# everything between <h1> and </h1> even though it crosses line breaks.
pattern2 = re.compile(r'<h1>(.*?)</h1>', flags=re.DOTALL)
# findall() returns the list of captured group contents.
print(pattern2.findall(text2))
2.bs库
#beautiful_soup库
from bs4 import BeautifulSoup
# Sample document used to demonstrate BeautifulSoup lookups.
# The link attribute is spelled "href" (the original notes had the typo
# "herf", which is not a real link attribute).
html = """
<html>
<body>
<a id="aa" href="https://www.baidu.com">百度一下</a>
<a></a>
<h1>hello</h1>
</body>
</html>
"""
# Parse with the pure-Python parser bundled with the standard library.
bs = BeautifulSoup(html, 'html.parser')
# Attribute-style access returns the FIRST matching tag.
print(bs.a)
# find_all() returns every matching tag as a list.
print(bs.find_all('a'))
# Subscripting a tag reads one of its attributes.
print(bs.a['href'])
# NOTE(review): repeats the first lookup above; possibly a different
# accessor (e.g. the tag's text) was intended -- confirm.
print(bs.a)
3.xpath(推荐使用)
from lxml import etree
# Sample document to query. The original line here was an unterminated
# triple-quoted placeholder, which swallowed the rest of the file; a
# complete, self-contained document is spelled out instead.
html = """
<html>
<body>
<a id="aa" href="https://www.baidu.com">百度一下</a>
<a></a>
<h1>hello</h1>
</body>
</html>
"""
# Parse the long string into an HTML document tree.
dom = etree.HTML(html)
# Prints the element object returned by the parser.
print(dom)
# xpath() searches the whole document: it returns [] when nothing
# matches, otherwise a list of the matches.
print(dom.xpath('//a/@href'))
# "/"  steps down exactly one level; "//" skips any number of parent levels,
#      e.g. //a would also find an <a> located at /body/ul/li/a.
# "@"  presumably selects an attribute of an element (the original note on
#      this line was garbled -- confirm).
# "/@href"  yields the value of the href attribute.
# "/text()" yields the text content of the element.
print(dom.xpath('//a/text()'))
# Attribute filter: keep only the <a> whose id matches. The original notes
# filtered on id="second_a", which does not exist in this document.
print(dom.xpath('//a[@id="aa"]/text()'))