# Scrape rental-listing data from Beike (cd.zu.ke.com).
from csv import writer
from re import fullmatch
import os
import requests
from bs4 import BeautifulSoup
def get_url(start_page=1, end_page=2):
    """Scrape Beike (cd.zu.ke.com) Chengdu rental listings into file/house.csv.

    Pages are fetched with ``range(start_page, end_page)`` semantics, i.e.
    ``end_page`` is exclusive.

    :param start_page: first result page to fetch (inclusive)
    :param end_page: stop page (exclusive)
    :raises requests.HTTPError: if a page request returns an error status
    """
    # Create the output directory 'file' if it does not exist yet.
    if not os.path.exists(r'file'):
        os.mkdir(r'file')
    # Request headers are loop-invariant; build them once, outside the page loop.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }
    # Context manager guarantees the CSV file is closed even if a request
    # or parse step raises (the original leaked the handle on exceptions).
    with open(r'file/house.csv', 'w', encoding='utf-8', newline='') as f:
        w1 = writer(f)
        # Header row.
        w1.writerow(['名字', '地址', '面积', '样式', '价格'])
        for page in range(start_page, end_page):
            url = rf'https://cd.zu.ke.com/zufang/pg{page}/#contentList'
            # timeout keeps a hung connection from stalling the whole crawl
            response = requests.get(url, headers=headers, timeout=10)
            # Fail fast on HTTP errors instead of silently parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')
            # Each listing card is one div under the article content container.
            div_house = soup.select('.content__article>div>div')
            for x in div_house:
                # Listing title.
                name = x.select_one('.twoline').text.strip()
                # The <p> tags hold address / area / layout, newline-separated;
                # split each into a list of fragments.
                msg = x.select('div>p')
                msg_list = [y.text.strip().split('\n') for y in msg]
                # Listings come in two layouts; the first fragment of the second
                # <p> tells them apart. Skip cards matching neither (e.g. ads).
                if fullmatch(r'\w+\s+/', msg_list[1][0]):
                    add = msg_list[1][1]
                    area = msg_list[1][3].strip()
                    pattern = msg_list[1][5].strip()
                elif fullmatch(r'\w+-\w+-\w+', msg_list[1][0]):
                    add = msg_list[1][0]
                    area = msg_list[1][2].strip()
                    pattern = msg_list[1][4].strip()
                else:
                    continue
                # Monthly price, e.g. "1500 元/月".
                price = x.select_one('.content__list--item-price').text
                w1.writerow([name, add, area, pattern, price])
            print(f'第{page}页加载完成')
if __name__ == '__main__':
    # Crawl result pages 1 through 100 (the end page is exclusive).
    get_url(start_page=1, end_page=101)