import requests
import xlrd as xlrd
from xlutils.copy import copy
from lxml import etree
def getData():
    """Scrape Beike (ke.com) sold-home listings for a fixed set of Dalian
    residential compounds and append them to an existing Excel workbook.

    For each compound name, result pages 1-20 are fetched and parsed with
    XPath; every listing becomes one row [title, house info, deal date,
    total price, position info, unit price, deal cycle].  After each page
    the rows are appended below the existing data in the first sheet of
    '贝壳成交.xls' (the workbook must already exist).

    Side effects: one HTTP GET per compound per page; re-opens and saves
    '贝壳成交.xls' once per page.  No return value.
    """
    # Compound (小区) names interpolated into the search URL.
    compounds = ('枫丹丽城', '锦泉源', '金色阳光家园', '奥林园', '美域盛景',
                 '富士庄园', '润泽园', '骏腾名苑', '泉水友好园', '泉水人家幸福里',
                 '泉水家年华', '龙畔金泉三期', '龙畔金泉二期', '龙畔金泉一期',
                 '龙畔金泉四期', '龙畔金泉五期K1区', '泉水N3区', '泉水N1区',
                 '泉水N2区', '龙畔金泉五期')
    # Request headers are loop-invariant, so build them once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.9 Safari/537.36',
        # TODO: paste your own logged-in Cookie value here — without it the
        # site is unlikely to return real listing data.
        'Cookie': '',
    }
    for compound in compounds:
        for page_no in range(1, 21):
            url = f'https://dl.ke.com/chengjiao/pg{page_no}rs{compound}/'
            res = requests.get(url, headers=headers)
            tree = etree.HTML(res.text)
            listings = tree.xpath('//ul[@class="listContent"]/li')
            rows = []
            for node in listings:
                title = node.xpath('.//div[@class="title"]/a/text()')[0]
                house_info = node.xpath('.//div[@class="houseInfo"]/text()')[1].strip()
                deal_date = node.xpath('.//div[@class="dealDate"]/text()')[0].strip()
                total_price = node.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
                position_info = node.xpath('.//div[@class="positionInfo"]/text()')[1].strip()
                unit_price = node.xpath('.//div[@class="unitPrice"]/span/text()')[0] + '元/平'
                deal_cycle = node.xpath('.//span[@class="dealCycleTxt"]/span/text()')[0]
                rows.append([title, house_info, deal_date, total_price,
                             position_info, unit_price, deal_cycle])
            # Append this page's rows below the data already in the sheet.
            # xlrd can only read, so the workbook is copied to a writable
            # xlwt object via xlutils.copy before writing.
            workbook = xlrd.open_workbook('贝壳成交.xls')
            first_sheet = workbook.sheet_by_name(workbook.sheet_names()[0])
            rows_old = first_sheet.nrows  # number of rows already present
            new_workbook = copy(workbook)
            new_worksheet = new_workbook.get_sheet(0)
            for row_offset, row in enumerate(rows):
                for col, value in enumerate(row):
                    # Start writing at rows_old so existing data is kept.
                    new_worksheet.write(rows_old + row_offset, col, value)
            new_workbook.save('贝壳成交.xls')
            print("xls格式表格写入数据成功!")
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not on import.
    getData()
# Python爬虫实战:爬取贝壳网二手房成交数据,将数据存入Excel。
# 最新推荐文章于 2024-08-16 23:27:41 发布