Scraping rental listings for one district on Lianjia

from urllib import request
from bs4 import BeautifulSoup as bs
import time
import re
import csv
import numpy as np
import pandas as pd

def all_info(url,startpage,endpage):
	all_house_url = []

	# Collect the house links from every index page
	for page in range(startpage,endpage):
		# Build the URL for this page of results
		page_url = url+'pg'+str(page)+'/#contentList'
		# Hand the page URL to spider_main for scraping
		url_list = spider_main(page_url)
		all_house_url.append(url_list)

	# all_house_url now holds each page's house links; scrape the details next
	# print(all_house_url)
	# Scrape every page of houses, storing each house's fields in all_house_detail_list
	all_house_detail_list = []
	for i in range(len(all_house_url)):
		print("Scraping house data on page "+str(i+1))
		url_page = all_house_url[i]
		
		# Visit each house's detail page found on this results page
		for page in url_page:
			# Build the full URL for this house
			each_house_url = 'https://sh.lianjia.com'+page+'?nav=0'
			# print(each_house_url)
			# Pass the URL to house_details, which extracts the listing's fields
			detail_list = house_details(each_house_url)
			if detail_list is None:
				continue

			all_house_detail_list.append(detail_list)
			time.sleep(4)

		print("Finished house data on page "+str(i+1))
		time.sleep(18)
	# print(all_house_detail_list)
	return all_house_detail_list


# Build the facility-icon list
# Icons used when a facility is present
is_equip = ['<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/03401a4eddd82179ae3774b43e382238.1524906085686_abc8a9ce-3748-4317-9955-2452322f07d9);"></i>',
 			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/f01e63a2d0b36d2b6b92269dac7210a8.1524905973505_6a9e4bde-4acb-4699-ba93-32f4dc13304a);"></i>', 
 			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/b45b25b8cbdbcbf1393999d1140d6729.1524906592660_dfa64012-e42c-4b11-a874-e2888e6dce4c);"></i>', 
 			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/2c5080db6cb434413d39fe816faddafe.1524906138308_77f21b82-5983-4448-8348-ef9346263338);"></i>', 
 			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/82e5b44b21844b608071ac426a5eb7e6.1524906411157_ae925a22-d95e-48bf-975c-447a27dd4ce9);"></i>', 
 			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/c40aee40a80ebcaa8d716a2c9ae14391.1524906024762_ac4fb64e-8467-46de-b6f5-7f9ba1ce2622);"></i>',
 			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/4c7c1728139585a142553edd47ecf2cd.1525926713820_83d52079-9922-41af-af95-45f889eb5c00);"></i>', 
 			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/b2abaa59759a7f4ae327ed67c6fbc6d8.1524906246773_6b435b4a-03d6-4292-acd8-6d3af96a791d);"></i>',
  			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/b024f9fdd5797563ead74f237105fd5a.1524906626107_4b1c45fe-0266-40af-b39c-6311887b0aaa);"></i>', 
  			'<i style="background-image: url(https://image1.ljcdn.com/rent-front-image/aa2df480d8496d0851febe38022b1da2.1524906515169_c731df5b-234f-4716-ba42-0058f833204c);"></i>'
]
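# These <i> icons mark a facility as present on the detail page; house_details
# below compares each icon's HTML (str(line)) against this list to produce a
# 0/1 flag per facility.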

def house_details(url):
	response = request.urlopen(url)
	re_html = response.read().decode('utf-8')
	# print(re_html)
	re_tree = bs(re_html,'html.parser')
	# print(re_tree)

	# title_info = re_tree.find(class_ = re.compile("content__title"))
	# The title contains the community name, layout, and orientation
	title_info = re_tree.find(class_ = "content__title")
	if title_info is None:
		return
	title_info = title_info.text.split(' ')
	if len(title_info)<3:
		return
	# print(title_info)
	# Community name (strip the leading '整租·' prefix on whole-rent listings)
	estate_info = title_info[0]
	if estate_info.startswith('整租'):
		estate_info = estate_info[3:]
	# Layout (rooms/halls)
	shape_info = title_info[1]
	# Orientation
	toward_info = title_info[2]
	# Monthly rent
	rent_info = re_tree.find(class_ = "content__aside--title").text

	# Floor area (drop the trailing unit character)
	area_info = re_tree.find_all(class_ = "content__article__table")[0]
	area_info = area_info.find_all('span')[2].text[:-1]

	# Basic house attributes
	base_house = re_tree.find(class_ = "content__article__info")
	# print(base_house)
	base_house_list = base_house.find_all('li')
	base_house_list_info = [x.text for x in base_house_list]
	# Items at every third position carry no value; use them as a filter list
	none_list = base_house_list_info[::3]
	# print(none_list)
	base_list = []
	for i in base_house_list_info:
		if i not in none_list:
			base_list.append(i)
	# print(base_list)

	# Listing date (each entry starts with a 3-character label, e.g. '发布:', hence the [3:] slice)
	release_date_info = base_list[0][3:]
	# Lease term
	rent_period_info = base_list[2][3:]
	# Floor
	layer_info = base_list[4][3:]
	# Elevator
	lift_info = base_list[5][3:]
	# Parking
	parking_info = base_list[6][3:]
	# Water supply
	water_info = base_list[7][3:]
	# Electricity supply
	electricity_info = base_list[8][3:]
	

	# Transit information
	traffic_info = re_tree.find(class_ = 'content__article__info4').find_all('li')
	traffic_list = [x.text for x in traffic_info]
	traffic_dis = []
	if len(traffic_list) != 0:
		for i in range(len(traffic_list)):
			# The last token is a distance such as '500米'; drop the unit character
			traffic_dis.append(int(traffic_list[i].split()[-1][:-1]))
		# Number of nearby subway lines
		subway_line_info = len(traffic_list)
		# print(subway_line_info)
		# Distance to the nearest subway station
		subway_distance_info = min(traffic_dis)
		# print(subway_distance_info)
	else:
		subway_line_info = 0
		subway_distance_info = 0

	# Address and district, taken from the breadcrumb navigation
	address_list = re_tree.find(class_ = 'bread__nav__wrapper oneline').text.split()

	# print(address_list)
	address_info = address_list[2][:-2]
	district_info = address_list[4][:-2]
	# print(address_info)

	# Facilities: compare each <i> icon against the known "present" icons
	equipment = re_tree.find(class_ = 'content__article__info2')
	# print(type(equipment))
	# print(equipment.find_all('i'))
	equip_list  = []
	for line in equipment.find_all('i'):
		if str(line) in is_equip:
			equip_list.append(1)
		else:
			equip_list.append(0)
	
	# TV
	tv_info = equip_list[0]
	# Fridge
	fridge_info = equip_list[1]
	# Washing machine
	washer_info = equip_list[2]
	# Air conditioning
	air_info = equip_list[3]
	# Water heater
	heater_info = equip_list[4]
	# Bed
	bed_info = equip_list[5]
	# Heating
	heating_info = equip_list[6]
	# Broadband
	wifi_info = equip_list[7]
	# Wardrobe
	closet_info = equip_list[8]
	# Gas
	gas_info = equip_list[9]
	
	# Photo link: take the first .jpg matched in the raw HTML
	pic = re_tree.find_all(class_ = 'content__article__slide__item')
	# print(pic)
	# print(pic[0] )

	pattern = r'img .*? src="(.+?\.jpg)"'
	compile_re = re.compile(pattern)
	imgList = compile_re.findall(re_html)
	img_info = imgList[0] if imgList else ''
	time_info  = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
	return [address_info,district_info,estate_info,area_info,rent_info,layer_info,shape_info,toward_info,release_date_info,rent_period_info,lift_info,parking_info,water_info,electricity_info,
			subway_line_info,subway_distance_info,tv_info,fridge_info,washer_info,air_info,heater_info,bed_info,heating_info,wifi_info,closet_info,gas_info,img_info,time_info]



# Return the house-listing URLs found on one index page
def spider_main(url):
	# Fake a browser header (see the hedged sketch after the code)
	# response = requests.get(url,{'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'})
	response = request.urlopen(url)
	re_html = response.read().decode('utf-8')
	# print(re_html)

	re_tree = bs(re_html,'html.parser')
	# Grab every house link on this results page
	house_url = re_tree.find_all(href=re.compile("^/zufang/SH"))
	# Collect all the house links into one list, deduplicated via set()
	house_list = []
	for i in range(len(house_url)):
		house_list.append(house_url[i]['href'])
	
	return list(set(house_list))

def writer_csv(all_list,filename):
	with open(filename,"w",newline="") as csvfile:
		writer = csv.writer(csvfile)
		writer.writerow(["Address","District","Community","Area","RentPrice","Layer","Shape","Toward","Release_Date","Rent_Period","Is_lift","Is_parking","water_way","electricity_way","Subwayline_Number","Subway_Distance",
			'Is_TV','Is_fridge','Is_washer','Is_air','Is_heater','Is_bed','Is_heating','Is_wifi','Is_closet','Is_gas','Img_link','Catch_time'])
		writer.writerows(all_list)

if __name__ == '__main__':
	url = 'https://sh.lianjia.com/zufang/'
	# page_list = [[1,10],[10,12],[12,15],[15,17],[17,20]]
	# page_list=[[20,25],[25,30]]
	# page_list = [[10,16],[16,22],[22,28],[28,34],[34,40],[40,46],[46,51]]
	# page_list = [[51,56],[56,60],[60,65],[65,70],[75,80],[80,85],[85,90]]
	page_list = [[70,75],[80,85],[85,90]]
	name = ["Address","District","Community","Area","RentPrice","Layer","Shape","Toward","Release_Date","Rent_Period","Is_lift","Is_parking","water_way","electricity_way","Subwayline_Number","Subway_Distance",
			'Is_TV','Is_fridge','Is_washer','Is_air','Is_heater','Is_bed','Is_heating','Is_wifi','Is_closet','Is_gas','Is_img','Catch_time']
	for page in page_list:
		all_house_list = all_info(url,page[0],page[1])
		all_house_info = pd.DataFrame(columns = name,data=all_house_list)
		filename = 'F:/个人发展/机器学习/房价信息/houseinfo'+str(page[0])+'.csv'
		all_house_info.to_csv(filename,encoding = 'gbk',index = False)
	# all_house_list = np.array(all_house_list).T
	# writer_csv(all_house_list,'house_info.csv')
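
One note on robustness: every page above is fetched with a bare request.urlopen(url), which sends urllib's default User-Agent, and the commented-out requests.get line in spider_main suggests blocking was a concern. Below is a minimal sketch, not part of the original script, of attaching the same browser header using urllib alone; fetch_html is a hypothetical helper name.

from urllib import request

def fetch_html(url):
	# Wrap the URL in a Request so a browser-like User-Agent can be attached;
	# the default 'Python-urllib/x.y' header is easy for a site to reject.
	req = request.Request(url, headers={
		'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
		               'AppleWebKit/537.36 (KHTML, like Gecko) '
		               'Chrome/75.0.3770.100 Safari/537.36')})
	with request.urlopen(req) as response:
		return response.read().decode('utf-8')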

Posting the code first. This was a small project I did in July 2019; this is the data-collection part, and a follow-up step mines the data for the factors behind rental prices.
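
For that follow-up step, the per-batch CSVs need to come back together into one table. A minimal sketch, assuming the files sit in the current directory and keep the houseinfo<start>.csv naming and gbk encoding used in __main__:

import glob
import pandas as pd

# Reload every batch file written by the scraper and stack them into one frame.
files = sorted(glob.glob('houseinfo*.csv'))
frames = [pd.read_csv(f, encoding='gbk') for f in files]
all_data = pd.concat(frames, ignore_index=True)
print(all_data.shape)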
