# Scrape all Hangzhou rental listings from Beike (贝壳, hz.zu.ke.com)
from bs4 import BeautifulSoup
import requests
import time
import random
import csv
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
def get_html(url):
    # pool of browser User-Agent strings
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    ]
    # pool of proxy IPs
    proxies = [
        "http://175.44.108.161:9999",
        "http://191.241.34.210:8080",
        "http://122.4.50.96:9999",
        "http://175.42.123.222:9999",
        'http://119.108.165.8:9000',
        'http://183.166.111.202:9999',
        'http://113.120.32.246:9999',
        'http://113.120.36.25:9999',
        'http://110.243.2.233:9999',
        'http://123.55.106.215:9999',
        'http://223.242.224.4:9999',
        'http://182.32.231.5:9999',
        'http://125.108.83.188:9000',
        'http://123.101.64.67:9999'
    ]
    # request the URL with a randomly chosen User-Agent header and proxy
    res = requests.get(url, headers={"User-Agent": random.choice(user_agent)},
                       proxies={"http": random.choice(proxies)})
    res.encoding = 'utf-8'  # set the response encoding
    return res
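
# Note (optional sketch, not part of the original script): the free proxies above go
# stale quickly and requests.get() is called without a timeout, so a single dead proxy
# can hang the crawl. A hypothetical wrapper such as get_html_retry() below simply
# retries with a fresh random header/proxy a few times; the name and retry count are
# assumptions, and passing timeout=10 to requests.get() inside get_html() would
# complete the picture.
def get_html_retry(url, attempts=3):
    for _ in range(attempts):
        try:
            res = get_html(url)            # reuses the random header/proxy logic above
            if res.status_code == 200:
                return res
        except requests.RequestException:  # proxy refused, connection error, etc.
            pass
        time.sleep(random.randint(2, 5))   # back off before the next attempt
    raise RuntimeError(f'Failed to fetch {url} after {attempts} attempts')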
# ### Collect the rental listing links
def main(start, end):
    for i in range(start, end):
        print(f'Scraping links on page {i}')
        # request the listing page
        res = get_html(f'https://hz.zu.ke.com/zufang/pg{i}/#contentList')
        soup = BeautifulSoup(res.text, 'html.parser')
        divs = soup.find_all(class_='content__list--item--main')
        data = []
        # collect every listing link on the page and save it
        for each in divs:
            href = 'https://hz.zu.ke.com' + each.find('a').attrs['href']
            name = each.find('a').text
            information = each.find(class_='content__list--item--des').text
            data.append([i, href, name, information])
        DataFrame(data).to_csv('租房链接/hrefs.csv', mode='a', index=False, header=False)
        print(f'Page {i} saved')
        time.sleep(random.randint(2, 5))
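
# Optional setup sketch (assumed helper, not in the original): main() appends to
# 租房链接/hrefs.csv, so the 租房链接 folder has to exist before the first run. A tiny
# helper like this, called once up front, avoids an OSError on the very first page.
def init_output_dir():
    import os
    os.makedirs('租房链接', exist_ok=True)  # folder that both output CSVs are written into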
# ### Scrape the details of each rental listing
def get_information(start, end):
    # read the saved file to get the listing links
    hrefs = pd.read_csv('租房链接/hrefs.csv', header=None)
    for i in range(start, end):
        data = []
        print(f'Scraping listing {i}')
        res = get_html(hrefs[1][i])  # column 1 of hrefs.csv holds the listing URL
        soup = BeautifulSoup(res.text, 'html.parser')
        try:
            # listing title
            data.append(soup.find(class_='content__title').text)
        except AttributeError:
            print('Page not found, skipping')
            continue
        # rental type (whole flat or shared)
        data.append(soup.find(class_='content__aside__list').find_all('li')[0].text)
        # house type
        data.append(soup.find(class_='content__aside__list').find_all('li')[1].text)
        lis = soup.find(class_='content__article__info').find_all('li', attrs={'class': "fl oneline"})
        # orientation ========
        data.append(lis[2].text)
        # floor
        data.append(lis[7].text)
        # elevator
        data.append(lis[8].text)
        # parking
        data.append(lis[10].text)
        # water
        data.append(lis[11].text)
        # electricity
        data.append(lis[13].text)
        # gas
        data.append(lis[14].text)
        # heating
        data.append(lis[16].text)
        # lease term
        data.append(lis[18].text)
        # facilities ===================================================
        lis2 = soup.find(class_='content__article__info2').find_all('li')[1:]
        str1 = 'facility_no'
        text = ''
        for each in lis2:
            if str1 in each.attrs['class']:
                text += '无' + each.text + '/'  # '无' = facility not available
            else:
                text += '有' + each.text + '/'  # '有' = facility available
        data.append(text)
        # rent and deposit =========================================
        lis3 = soup.find_all(class_='table_col')
        # payment method
        data.append(lis3[0].text + ':' + lis3[5].text)
        # rent
        data.append(lis3[1].text + ':' + lis3[6].text)
        # deposit
        data.append(lis3[2].text + ':' + lis3[7].text)
        # service fee
        data.append(lis3[3].text + ':' + lis3[8].text)
        # agency fee
        data.append(lis3[4].text + ':' + lis3[9].text)
        # save ==============================================================
        DataFrame([data]).to_csv('租房链接/租房信息.csv', mode='a', index=False, header=False)
        time.sleep(random.randint(2, 5))
        print(len(data))
        print(f'Listing {i} saved')
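
# Reading the results back (sketch, assumed helper): 租房信息.csv is saved without a
# header row, so naming the 18 columns in the same order they are appended above makes
# later analysis with pandas easier. The column labels are my own, not from the site.
RESULT_COLUMNS = ['title', 'rental_type', 'house_type', 'orientation', 'floor',
                  'elevator', 'parking', 'water', 'electricity', 'gas', 'heating',
                  'lease_term', 'facilities', 'payment', 'rent', 'deposit',
                  'service_fee', 'agency_fee']
def load_results(path='租房链接/租房信息.csv'):
    return pd.read_csv(path, header=None, names=RESULT_COLUMNS)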
# import threading
# Multi-threaded variant: each thread scrapes a different range of saved links
# t1 = threading.Thread(target=get_information, args=(151, 500))
# t2 = threading.Thread(target=get_information, args=(654, 1000))
# t3 = threading.Thread(target=get_information, args=(1151, 1500))
# t4 = threading.Thread(target=get_information, args=(2153, 2500))
# t1.start()
# t2.start()
# t3.start()
# t4.start()
# Collect and save the listing links: pass the first and last page number to scrape
# main(1, 100)
# Open the saved links and scrape each one: pass the first and last link index
get_information(1, 500)
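
# Optional (sketch, commented out like the threading variant above): guarding the calls
# lets this file be imported, e.g. to reuse get_html() or get_information() from another
# script or notebook, without immediately starting a 500-listing crawl.
# if __name__ == '__main__':
#     main(1, 100)
#     get_information(1, 500)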