正则(re 模块)是 Python 标准库自带的;xpath(通过 lxml 使用)和 bs4 属于第三方库,需要另外安装
bs4 和 xpath 都是用来解析html数据的
相比之下,xpath的速度会快一点
正则使用元字符
xpath和bs4会先将获取的源码解析成一个带层级结构的对象(文档树),再按节点进行查找
正则把源码当作纯文本处理,没有层级结构,只有字符出现的先后顺序
代码如下:
import re
from lxml import etree
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
# Page that is downloaded and fed to each parsing demo.
url = "http://www.ivsky.com/tupian/haiyangshijie/"

# HTTP request headers; the User-Agent identifies the client as desktop Firefox.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
}
def get_code_with_url(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded to ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid in the chosen encoding.
    """
    request = Request(url, headers=headers)
    # The context manager closes the connection even if read() raises;
    # previously the response object was never closed.
    with urlopen(request) as response:
        body = response.read()
        # Honor the charset declared in Content-Type instead of assuming
        # UTF-8, so pages served in another encoding still decode.
        charset = response.headers.get_content_charset() or "utf-8"
    try:
        return body.decode(charset)
    except LookupError:
        # Server declared a codec Python does not know: fall back to the
        # UTF-8 default that the bare decode() call always used.
        return body.decode()
def reMethod(code):
    """Print every ``(src, alt)`` pair the ``<img>`` regex finds in *code*.

    Args:
        code: HTML source text to scan.

    The list of 2-tuples produced by ``findall`` is printed as-is; nothing
    is returned.
    """
    # re.S lets "." match newlines, so the gap between src and alt may span lines.
    matches = re.findall(r'<img src="(.*?)".*?alt="(.*?)">', code, flags=re.S)
    print(matches)
def xpath(code):
    """Print the ``alt`` and ``src`` attributes of linked images in *code*.

    Args:
        code: HTML source text to parse.

    Selects every ``<img>`` that is a direct child of an ``<a>`` carrying an
    ``href`` attribute and prints one ``alt src`` line per image.
    """
    tree = etree.HTML(code)
    # Each match is an lxml element (its repr looks like
    # <Element img at 0x2e95120>), so attributes are read with .get().
    for node in tree.xpath('//a[@href]/img'):
        print(node.get('alt'), node.get('src'))
def beautifulsoup(code):
    """Print the ``<img>`` tags in *code* matched by ``ul.ali li img``.

    Args:
        code: HTML source text to parse.

    The source is parsed with the ``lxml`` backend into a
    ``bs4.BeautifulSoup`` object, and the result of the CSS selection is
    printed in a single call.
    """
    print(BeautifulSoup(code, 'lxml').select('ul.ali li img'))
# Download the page once, then hand the same HTML to each parsing demo in turn.
code = get_code_with_url(url)
for _demo in (reMethod, xpath, beautifulsoup):
    _demo(code)