import xlwt
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

# Shared list collecting one row per listing:
# [name, area, toward, size, floor_type, floor_height, price].
# CPython's GIL makes list.append atomic, so concurrent appends from the
# worker threads are safe; row order is nondeterministic.
all_dates = []

# Network timeout (seconds) so a stalled request cannot hang a worker forever.
REQUEST_TIMEOUT = 10


def get_all_dates(url):
    """Scrape one listing page of wh.lianjia.com/zufang and append each
    house's details to the module-level ``all_dates`` list.

    For every card on the page a second request is made to the listing's
    own detail page to read the floor information.

    A malformed or advertisement card (missing xpath targets) is skipped
    instead of aborting the whole page.
    """
    resp = requests.get(url, timeout=REQUEST_TIMEOUT).text
    tree = etree.HTML(resp)
    div_list = tree.xpath('//div[@class = "content__list"]/div')
    for div in div_list:
        try:
            # Location: district - area - estate name
            name = div.xpath("./div/p[2]/a//text()")
            name = name[0] + '-' + name[1] + '-' + name[2]
            # Floor area
            area = div.xpath("./div/p[2]/text()[5]")[0]
            area = area.replace('\n', '').replace(' ', '')
            # Orientation
            toward = div.xpath("./div/p[2]/text()[6]")[0]
            toward = toward.replace(' ', '')
            # Layout (number of rooms)
            size = div.xpath("./div/p[2]/text()[7]")[0]
            size = size.replace(' ', '').replace('\n', '')
            # Rent price (amount + unit)
            prize = div.xpath("./div/span//text()")
            prize = prize[0] + prize[1]
            # Follow the listing's own detail page for the floor info
            new_url = 'https://wh.lianjia.com' + div.xpath('./div/p[1]/a/@href')[0]
            new_resp = requests.get(new_url, timeout=REQUEST_TIMEOUT)
            new_tree = etree.HTML(new_resp.text)
            floor = new_tree.xpath('//*[@id="aside"]/ul/li[3]/span[2]/text()')[0]
            floor = floor.split(' ')[1]
            floor_size = floor.split('/')[0]  # floor-height category
            floor_high = floor.split('/')[1]  # total number of floors
            date = [name, area, toward, size, floor_size, floor_high, prize]
            all_dates.append(date)
            print('正在获取!!!', date)
        except (IndexError, requests.RequestException) as exc:
            # One bad card / failed detail request must not kill the page.
            print('跳过一条记录:', exc)


# Thread pool to speed up scraping. Keep the futures so worker exceptions
# surface (p.submit alone silently swallows them).
with ThreadPoolExecutor(50) as p:
    futures = [
        p.submit(get_all_dates, f"https://wh.lianjia.com/zufang/pg{page}/#contentList")
        for page in range(1, 51)
    ]
    for future in futures:
        try:
            future.result()  # re-raise any page-level failure so it is visible
        except Exception as exc:
            # Best-effort: report the failed page and keep the rest.
            print('页面抓取失败:', exc)

# Save the collected rows to an Excel (.xls) workbook.
wookbook = xlwt.Workbook(encoding='utf-8')
booksheet = wookbook.add_sheet('链家')
title = ['位置', '面积', '朝向', '户型', '楼层信息', '楼层', '租金']
for col, heading in enumerate(title):
    booksheet.write(0, col, heading)
for row, item in enumerate(all_dates, start=1):
    for col, value in enumerate(item):
        booksheet.write(row, col, value)
wookbook.save('链家.xls')
爬取链家租房信息
最新推荐文章于 2024-04-12 14:15:06 发布