xpath,一直返回空列表
爬虫运行后,XPath 查询返回的是空列表;但是,在谷歌浏览器中用 XPath 插件却可以定位到对应的元素。
检查了好久,一直没有找到问题所在,请大佬们帮忙看看。
爬取的网站是https://hotel.meituan.com/beijing/
#!/usr/bin/env python
# coding:utf-8
import json
import requests
from lxml import etree
def one_to_page(html, *, chunk_size=16, max_chunks=10):
    """Download a listing page and yield its matched text nodes in groups.

    This is a generator, so the HTTP request is only sent once the caller
    starts iterating.

    Args:
        html: URL of the page to download. The name is kept for backward
            compatibility even though it holds a URL rather than markup.
        chunk_size: Number of text nodes per yielded group.
        max_chunks: Upper bound on the number of groups yielded.

    Yields:
        Lists of at most ``chunk_size`` strings, in document order. Iteration
        stops as soon as the matched text is exhausted, so nothing is yielded
        when the XPath query matches no text.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    headers = {
        'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'Cookie': "__mta=244085050.1574669799871.1574669799871.1575443842607.2; _lxsdk_cuid=16ea1a1d31cc8-0cef6b12dd6cf-2393f61-13c680-16ea1a1d31cc8; ci=1; rvct=1; hotel_city_id=1; hotel_city_info=%7B%22id%22%3A1%2C%22name%22%3A%22%E5%8C%97%E4%BA%AC%22%2C%22pinyin%22%3A%22beijing%22%7D; iuuid=CB7C1ADC74006ADDA5A29B932FA85F4C5409BE451C42F75D60DF0E90967318BD; cityname=%E5%8C%97%E4%BA%AC; _lxsdk=CB7C1ADC74006ADDA5A29B932FA85F4C5409BE451C42F75D60DF0E90967318BD; Hm_lvt_f66b37722f586a240d4621318a5a6ebe=1574749306; __utma=211559370.974893437.1574749307.1574749307.1574749307.1; __utmz=211559370.1574749307.1.1.utmcsr=baidu|utmccn=baidu|utmcmd=organic|utmcct=zt_search; uuid=673d60ba0e18472a82f4.1575426922.1.0.0; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; IJSESSIONID=1a7g3pxb3tz3f1s0mtxdgygcoy; _lxsdk_s=16ecfc52dae-052-33a-b52%7C%7C5",
    }

    # A timeout keeps a stalled connection from hanging the script forever,
    # and raise_for_status() surfaces a blocked or failed request instead of
    # silently parsing an error page into an empty result.
    response = requests.get(html, headers=headers, timeout=10)
    response.raise_for_status()

    # Parse the raw bytes so lxml can detect the encoding itself.
    tree = etree.HTML(response.content, etree.HTMLParser())

    # NOTE(review): this absolute path looks like it was copied from the
    # browser's rendered DOM. If the listing is filled in by JavaScript, these
    # nodes will not exist in the raw response and the query returns [] --
    # TODO confirm by inspecting response.text. Also note that
    # @class='poi-item poi-item-active' only matches elements whose class
    # attribute is exactly that string.
    texts = tree.xpath("/html/body/mieta[@id='yEnvMX']/main[@id='app']/section[@class='container']/div[@class='g-content clearfix']/div[@class='content-view']/div[@id='main-view']/div[@id='list-view']/div[@class='poi-results']/article[@class='poi-item poi-item-active']/div[@class='info-wrapper']//text()")

    # Walk the flat list in consecutive windows; every window starts where the
    # previous one ended, so no group is repeated.
    for start in range(0, chunk_size * max_chunks, chunk_size):
        chunk = texts[start:start + chunk_size]
        if not chunk:
            break
        yield chunk
def write_file(data, path='D:\\ test.txt'):
    """Append each record to a text file as one JSON line and echo it.

    Args:
        data: Iterable of JSON-serialisable records (for example lists of
            strings). It is consumed exactly once, so a generator is fine.
        path: Destination file, opened in append mode with UTF-8 encoding.
            The default has the same runtime value as the original literal
            (backslash followed by a space), now written with a valid escape.
            NOTE(review): the space was presumably added to avoid ``\\t``
            being read as a tab; confirm whether ``D:\\test.txt`` was meant.

    Returns:
        None, always.

    Raises:
        OSError: If the destination file cannot be opened or written.
        TypeError: If a record is not JSON-serialisable.
    """
    # Open the file once for the whole run; the context manager closes it,
    # so no explicit close() is needed. The file is created even when data
    # turns out to be empty.
    with open(path, 'a', encoding='utf-8') as f:
        for record in data:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            print(record)
    return None
def main():
    """Scrape the hard-coded listing URL, save the records, and print 'ok'."""
    url = 'https://hotel.meituan.com/beijing/'
    # one_to_page is lazy: the download happens while write_file iterates.
    records = one_to_page(url)
    result = write_file(records)
    # write_file returns None once every record has been written.
    if result is None:
        print('ok')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()