本篇文章介绍如何使用爬虫爬取某租房网站的房源信息,数据仅用于学习,不作任何商业用途。
首先在终端(命令行,例如 PyCharm 的 Terminal 面板)中安装 requests、parsel 两个模块:requests 用于发送网络请求获取页面数据,parsel 用于对获取到的页面内容进行解析。
pip install requests
pip install parsel
下面开始实操代码:
import requests
import parsel
# Scrape one page of Lianjia rental listings and print one "|"-separated row
# per listing: title|area|address details|price.
#
# To crawl several result pages, call fetch_listings() once per page URL
# (for example "https://hz.lianjia.com/zufang/pg" + str(page) +
# "rt200600000002/#contentList") and write the returned rows to a UTF-8 text
# file instead of printing them.
url = "https://nj.lianjia.com/zufang/pg3/#contentList"
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
}
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def clean_text(text):
    """Return *text* with every whitespace character removed."""
    return "".join(text.split())


def format_listing(title_parts, location_parts, address_parts, price):
    """Build the output row for a single listing.

    Args:
        title_parts: Text nodes of the listing's title link.
        location_parts: Text nodes of the location links (district, etc.).
        address_parts: All text nodes of the description line.
        price: Price text, or None when the page has no price element.

    Returns:
        "title|area|address|price元", or None when the listing has no usable
        title. Missing location/address/price become empty fields instead of
        raising or reusing values from the previous listing.
    """
    title = ",".join(filter(None, map(clean_text, title_parts)))
    if not title:
        return None

    area = "-".join(part.strip() for part in location_parts)

    # The description line repeats the location links separated by bare "-"
    # text nodes; drop those separators and any whitespace-only nodes.
    cleaned = (clean_text(part) for part in address_parts)
    address = "".join(part for part in cleaned if part and part != "-")

    price_text = price.strip() + "元" if price else ""
    return "|".join((title, area, address, price_text))


def fetch_listings(page_url, headers, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return the formatted rows of its listings.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url=page_url, headers=headers, timeout=timeout)
    # Fail loudly on error responses rather than parsing an error page.
    response.raise_for_status()

    selector = parsel.Selector(response.text)
    rows = []
    for li in selector.css(".content__list--item--main "):
        row = format_listing(
            li.css(".content__list--item--title a::text").getall(),
            li.css(".content__list--item--des a::text").getall(),
            li.css(".content__list--item--des ::text").getall(),
            li.css(".content__list--item-price em::text").get(),
        )
        if row is not None:
            rows.append(row)
    return rows


def main():
    """Print every listing row found on the configured page."""
    for row in fetch_listings(url, header):
        print(row)


if __name__ == "__main__":
    main()