今天忽然紧急需要某个城市的楼盘信息.
火速搞定:
import requests
from lxml import etree
import csv
import time
def get_html(url, header, count=0, max_count=5, code='gbk'):
    """Fetch *url* with an HTTP GET, retrying when the request fails.

    Args:
        url: Address to request.
        header: Mapping of HTTP request headers to send.
        count: Number of failed attempts so far; callers normally leave it 0.
        max_count: Maximum number of retries after the first failed attempt.
        code: Encoding assigned to the response before it is returned.

    Returns:
        The ``requests.Response`` with ``encoding`` set to *code*, or ``None``
        once the attempt counter exceeds *max_count*.
    """
    while True:
        try:
            # Headers must go in by keyword: the second positional argument
            # of requests.get() is ``params`` (the query string).
            r = requests.get(url, headers=header, timeout=30)
            # Turn 4xx/5xx answers into exceptions so they are retried too.
            r.raise_for_status()
        except requests.RequestException:
            count += 1
            print('第{}次访问超时{}'.format(count, url))
            if count > max_count:
                return None
            # Back off briefly before the next attempt.
            time.sleep(5)
        else:
            r.encoding = code
            return r
# Translation table that deletes tabs, newlines and carriage returns.
_LAYOUT_CHARS = str.maketrans('', '', '\t\n\r')


def _clean_text(text):
    """Trim *text* and remove embedded tabs, newlines and carriage returns."""
    return text.strip().translate(_LAYOUT_CHARS)


def _field_text(item, path):
    """Return the cleaned text matched by *path* under *item*, or ''.

    *path* may select text nodes or elements; elements contribute their
    full string value.
    """
    parts = [
        match if isinstance(match, str) else match.xpath('string(.)')
        for match in item.xpath(path)
    ]
    return _clean_text(''.join(parts))


def _parse_listings(page_text):
    """Return one (name, address, price, house type, tags) row per listing.

    Fields are read relative to each listing node, so a listing that lacks
    a field yields an empty cell instead of shifting later rows.
    """
    root = etree.HTML(page_text)
    if root is None:
        # Nothing parsable on this page; report no rows.
        return []
    return [
        (
            _field_text(item, './/div[@class="nlcd_name"]/a/text()'),
            _field_text(item, './/div[@class="address"]'),
            _field_text(item, './/div[@class="nhouse_price"]/span/text()'),
            _field_text(item, './div[@class="house_type clearfix"]'),
            _field_text(item, './/div[@class="fangyuan"]'),
        )
        for item in root.xpath('//div[@class="nlc_details"]')
    ]


def get_loupan(total_page=36, csv_path='天津新房1.csv'):
    """Scrape the new-home listing pages and write them to a CSV file.

    Pages 1..*total_page* of the listing index are fetched with
    ``get_html``; every listing found becomes one CSV row under a header
    row. A page that cannot be fetched is reported and skipped.

    Args:
        total_page: Number of index pages to fetch.
        csv_path: Destination CSV path; an existing file is overwritten.
    """
    title = ['楼盘名称', '地址', '价格', '户型', '标签']
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(title)
        for i in range(1, total_page + 1):
            url = "https://tj.newhouse.fang.com/house/s/b9{}/".format(i)
            header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
                # Send the page's own URL as the referer.
                'Referer': url,
            }
            r = get_html(url, header)
            if r is None:
                # get_html gave up on this page; keep going with the rest.
                print('第{}页获取失败,已跳过: {}'.format(i, url))
            else:
                writer.writerows(_parse_listings(r.text))
            print('\r完成进度为:{:.2f}%'.format(i * 100 / total_page))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    get_loupan()