# Parse the fetched HTML into an element tree (lxml), then run XPath queries against that tree to extract the content we want to scrape.
import requests
from lxml import etree
# 1. Fetch the HTML page.
url = 'http://www.langlang2017.com/'
# A timeout keeps the script from hanging forever if the host never answers.
response = requests.get(url, timeout=10)
# Decode the raw bytes explicitly as UTF-8 (the original codec name was a typo
# and raised LookupError).
content = response.content.decode('utf-8')

# 2. Parse the page into an element tree so it can be queried with XPath.
tree = etree.HTML(content)

# Approach 1: collect the text nodes of every <div> and pick one by position.
# NOTE(review): the -6 index depends on the page's current layout and will
# raise IndexError if fewer than six text nodes are found.
div_list = tree.xpath('//div/text()')
print(div_list[-6])

# Approach 2: select the target <div> elements directly by their class name.
tel = tree.xpath('//div[@class="dianhua"]/text()')
print(tel[0])
addr = tree.xpath('//div[@class="dizhi"]/text()')
print(addr[0])

# info = tree.xpath('//li/img/@src')
# print(info)

# Walk the <li> items under the banner box and print each image's source and
# alt text.
li_list = tree.xpath('//div[@class="banner_box"]/ul/li')
for li in li_list:
    # Relative XPath: look only at the <img> directly inside this <li>.
    src = li.xpath('./img/@src')[0]
    alt = li.xpath('./img/@alt')[0]
    print(src, alt)