import time

import requests
from lxml import etree
# Browser-like headers so lianjia.com does not immediately reject the request.
headers1 = {
    'user-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
}


def _clean(parts):
    """Join a list of xpath text fragments and strip newlines/spaces."""
    return ''.join(parts).replace('\n', '').replace(' ', '')


def scrape(pages=100, delay=0.0):
    """Scrape Changsha Lianjia rental listings and print one line per listing.

    Args:
        pages: number of list pages to fetch (pg1 .. pg{pages}).
        delay: seconds to sleep between page requests; raise above 0
            (e.g. 2) to slow the crawl if anti-bot verification appears.
    """
    for i in range(1, pages + 1):
        url_ = f"https://cs.lianjia.com/zufang/pg{i}/"
        # timeout guards against the request hanging forever on a stalled
        # connection; 10s is generous for a listing page.
        response = requests.get(url_, headers=headers1, timeout=10)
        tree = etree.HTML(response.text)
        # Each listing card is a div with class "content__list--item";
        # locating these divs gives us one node per listing.
        div_list = tree.xpath('//div[@class="content__list--item"]')
        for data in div_list:
            title = data.xpath('.//p[@class="content__list--item--title"]/a[@class="twoline"]/text()')  # listing title
            info = data.xpath('.//p[@class="content__list--item--des"]//text()')  # property details
            label = data.xpath('.//p[@class="content__list--item--bottom oneline"]//text()')  # tags
            price_parts = data.xpath('.//span[@class="content__list--item-price"]//text()')  # price
            # Guard: xpath can return an empty list on a malformed card;
            # the original code indexed [0] unconditionally and could crash.
            price = price_parts[0] if price_parts else ''
            print(_clean(title), _clean(info), _clean(label), price)
        if delay:
            time.sleep(delay)


if __name__ == "__main__":
    scrape()
# NOTE: If human/CAPTCHA verification appears while crawling ~100 pages:
# - solve the CAPTCHA manually in a browser and reuse its cookie, or
# - slow the crawl down, e.g.:
#       time.sleep(2)



