爬取楼盘信息xpath的运用

最新推荐文章于 2022-04-12 22:52:57 发布

huibif

最新推荐文章于 2022-04-12 22:52:57 发布

阅读量182

点赞数

分类专栏：学习笔记　文章标签：xpath 爬虫

本文链接：https://blog.csdn.net/xlionsky/article/details/113359636

版权

学习笔记专栏收录该内容

34 篇文章 0 订阅

订阅专栏

这段代码用于从天津新房网站抓取楼盘的详细信息，包括楼盘名称、地址、价格、户型和状态，并将数据保存到CSV文件中。通过设置超时重试机制确保请求稳定性，使用lxml库解析HTML，最后将数据写入文件，便于分析。

摘要由CSDN通过智能技术生成

今天忽然紧急需要某个城市的楼盘信息。
火速搞定：

import requests
from lxml import etree
import csv
import time

def get_html(url, header, count=0, max_count=5, code='gbk'):
    """Fetch ``url`` with retries and return the response, or None on failure.

    Args:
        url: Address to request.
        header: Dict of HTTP request headers to send.
        count: Number of failed attempts already made (callers normally
            leave this at 0).
        max_count: Maximum number of retries after the first failure.
        code: Encoding assigned to the response before it is returned.

    Returns:
        The ``requests`` response with ``encoding`` set to ``code``, or
        ``None`` once the failure count exceeds ``max_count``.
    """
    while True:
        try:
            # ``headers=`` must be a keyword: the second positional argument
            # of requests.get is ``params`` (the query string).
            r = requests.get(url, headers=header, timeout=30)
            # Treat HTTP 4xx/5xx responses as failures so they are retried.
            r.raise_for_status()
        except requests.RequestException:
            count += 1
            print('第{}次访问超时{}'.format(count, url))
            if count > max_count:
                return None
            # Back off briefly before the next attempt.
            time.sleep(5)
        else:
            r.encoding = code
            return r

def _clean_text(text):
    """Strip surrounding whitespace and remove embedded tabs and line breaks."""
    return text.strip().replace('\t', '').replace('\n', '').replace('\r', '')


def _element_text(node, path):
    """Return the cleaned string value of the first element matching ``path``.

    Returns an empty string when nothing under ``node`` matches, so a
    listing with a missing field still produces a full-width CSV row.
    """
    matches = node.xpath(path)
    if not matches:
        return ''
    return _clean_text(matches[0].xpath('string(.)'))


def _joined_text(node, path):
    """Return the cleaned text nodes selected by ``path`` joined together."""
    return ''.join(_clean_text(piece) for piece in node.xpath(path))


def get_loupan(total_page=36, csv_path='天津新房1.csv'):
    """Scrape the new-house listing pages and write one CSV row per listing.

    Each page ``https://tj.newhouse.fang.com/house/s/b9<N>/`` for N in
    ``1..total_page`` is fetched through ``get_html``. Every
    ``div.nlc_details`` element on a page is treated as one listing and its
    name, address, price, house type and status are read relative to that
    element, so the columns of a row always belong to the same listing.

    Args:
        total_page: Number of listing pages to fetch, starting at page 1.
        csv_path: Path of the CSV file to (over)write, encoded as UTF-8.
    """
    title = ['楼盘名称', '地址', '价格', '户型', '标签']
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(title)

        for i in range(1, total_page + 1):
            url = "https://tj.newhouse.fang.com/house/s/b9{}/".format(i)
            header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
                # Send the page's own URL as the referer.
                'Referer': url,
            }
            r = get_html(url, header)

            if r is None:
                # get_html gave up on this page; keep going with the rest.
                print('第{}页获取失败，已跳过: {}'.format(i, url))
            else:
                xml = etree.HTML(r.text)
                rows = []
                for item in xml.xpath('//div[@class="nlc_details"]'):
                    rows.append((
                        _joined_text(item, './/div[@class="nlcd_name"]/a/text()'),
                        _element_text(item, './/div[@class="address"]'),
                        _joined_text(item, './/div[@class="nhouse_price"]/span/text()'),
                        _element_text(item, './div[@class="house_type clearfix"]'),
                        _element_text(item, './/div[@class="fangyuan"]'),
                    ))
                writer.writerows(rows)
                # Push each finished page to disk so an interrupted run
                # still leaves the pages scraped so far in the file.
                f.flush()

            print('\r完成进度为:{:.2f}%'.format(i * 100 / total_page))

 
# Start the scrape only when this file is executed directly as a script.
if __name__ == "__main__":
    get_loupan()