爬取楼盘信息:XPath 的运用

今天突然急需某个城市的楼盘信息,
于是用 requests 加 lxml 的 XPath 火速搞定:

import requests
from lxml import etree
import csv
import time

def get_html(url,header,count=0,max_count = 5,code = 'gbk'):
    """Fetch ``url`` with retries and return the ``requests`` response.

    Args:
        url: Address to request.
        header: Dict of HTTP request headers.
        count: Number of failed attempts already made (normally 0).
        max_count: A retry is made while the number of failed attempts is
            less than or equal to this value.
        code: Text encoding assigned to the response before it is returned.

    Returns:
        The response object with ``encoding`` set to ``code``, or ``None``
        when every attempt failed.
    """
    while True:
        try:
            # The headers must be passed by keyword: the second positional
            # argument of requests.get is ``params`` (the query string).
            r = requests.get(url, headers=header, timeout=30)
            # Must be *called*; a bare attribute reference checks nothing.
            r.raise_for_status()
            r.encoding = code
            return r
        except requests.RequestException:
            count += 1
            print('第{}次访问超时{}'.format(count,url))
            if count > max_count:
                # Out of retries: signal the failure to the caller.
                return None
            # Back off briefly before the next attempt.
            time.sleep(5)

def _clean_text(text):
    """Strip surrounding whitespace and remove embedded tabs and line breaks."""
    return text.strip().replace('\t', '').replace('\n', '').replace('\r', '')


def _node_text(parent, path):
    """Return the cleaned string value of the first node matching ``path``.

    An empty string is returned when nothing under ``parent`` matches, so a
    listing with a missing field still yields a complete row.
    """
    nodes = parent.xpath(path)
    if not nodes:
        return ''
    return _clean_text(nodes[0].xpath('string(.)'))


def get_loupan():
    """Scrape the listing pages and write one CSV row per listing.

    Pages 1..36 of the listing index are downloaded through ``get_html``.
    Every ``nlc_details`` block on a page becomes one row (name, address,
    price, house type, tags) in ``天津新房1.csv``. Pages that cannot be
    downloaded are reported and skipped.
    """
    title = ['楼盘名称', '地址', '价格', '户型', '标签']
    totalPage = 36

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
    }

    # Open the file once; flushing after each page keeps already-scraped
    # rows on disk even if a later page fails.
    with open('天津新房1.csv', 'w', newline='', encoding='utf-8') as f:
        # csv.writer's second positional argument is the dialect, not the
        # column titles, so the titles are only written as the first row.
        writer = csv.writer(f)
        writer.writerow(title)

        for i in range(1, totalPage + 1):
            url = "https://tj.newhouse.fang.com/house/s/b9{}/".format(i)
            # Correctly spelled header name, with the page number filled in.
            header['Referer'] = url

            r = get_html(url, header)
            if r is None:
                # get_html gave up on this page; keep going with the rest.
                print('第{}页获取失败,已跳过'.format(i))
                continue

            xml = etree.HTML(r.text)

            # Extract all fields relative to each listing node so that a
            # listing lacking one field cannot shift the columns of others.
            rows = []
            for item in xml.xpath('//div[@class="nlc_details"]'):
                name = _clean_text(''.join(
                    item.xpath('.//div[@class="nlcd_name"]/a/text()')))
                address = _node_text(item, './/div[@class="address"]')
                price = ''.join(
                    item.xpath('.//div[@class="nhouse_price"]/span/text()'))
                house_type = _node_text(
                    item, './div[@class="house_type clearfix"]')
                status = _node_text(item, './/div[@class="fangyuan"]')
                rows.append((name, address, price, house_type, status))

            writer.writerows(rows)
            f.flush()

            print('\r完成进度为:{:.2f}%'.format(i*100/totalPage))

 
if __name__ == '__main__':
    # Run the scraper only when this file is executed as a script.
    get_loupan()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值