# Sharing a practical tip: I used to rely on regular expressions; here I try XPath instead.
# -*- coding: utf-8 -*-
# Author:Mr.Dai
# @Time : 2019/11/15 11:59
import requests
from lxml import etree
def MainConnect(MainUrl, header, limit=40, timeout=10):
    """Fetch ``MainUrl`` and map city names to their detail-page URLs.

    The page is downloaded and parsed once. Link targets and link texts are
    read from the anchors under ``<dl class="seo_hot sta_unfold">`` and
    paired positionally.

    Args:
        MainUrl: URL of the page to fetch. It is also the prefix that every
            extracted ``href`` is appended to.
        header: HTTP request headers passed to ``requests.get``.
        limit: Maximum number of links to collect. Defaults to 40, the count
            that used to be hard-coded. ``None`` collects every match.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Defaults to 10.

    Returns:
        A dict mapping each link text to ``MainUrl + href``. When the page
        holds fewer than ``limit`` matching links, only the ones found are
        returned instead of raising ``IndexError``. If the same text occurs
        more than once, the last occurrence wins.
    """
    # Without a timeout requests may wait on an unresponsive server forever.
    html = requests.get(url=MainUrl, headers=header, timeout=timeout).text

    # Parse the document a single time and reuse the tree for both queries.
    tree = etree.HTML(html)
    hrefs = tree.xpath('//dl[@class="seo_hot sta_unfold"]/dd/a/@href')
    names = tree.xpath('//dl[@class="seo_hot sta_unfold"]/dd/a/text()')

    # zip() stops at the shorter sequence, so a short page cannot overrun.
    return {
        name: MainUrl + href
        for name, href in zip(names[:limit], hrefs[:limit])
    }
if __name__ == '__main__':
    # Request headers: send a desktop Chrome user-agent string.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
    }
    # Ctrip hotels home page.
    MainUrl = "https://hotels.ctrip.com"
    # Build the city-name -> URL mapping; the result is kept for inspection.
    detail = MainConnect(MainUrl=MainUrl, header=header)
# Life is short, I choose Python.