今天闲来无事帮同学抓取了一个房地产项目的数据网站
(此处为原文截图:查询页面)
就是这样一个页面
当你输入关键字的时候,在 Network 面板里你会发现这样一个请求
继续往下翻
看看preview
有趣啊
实际操作一波
(实际上我当时写代码的时候发现 XHR里面根本就没有数据传送过来,所以我果断用了selenium,弱智了弱智了)
简单的发送一个请求
可以看到我们所需要的数据都在里面(有时候就是这样,找不到接口的时候就苦逼的很),后面就是提取我们所需要的数据了
我还是放一下之前的代码吧,其实还行
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time,requests
from lxml import etree
import csv,re
import os
def get_building_info(name, urls):
    """Download each building ("楼盘") detail table and dump it to one CSV per page.

    Parameters
    ----------
    name : str
        Filename prefix for the output CSVs (surrounding whitespace stripped).
    urls : iterable of str
        Building-detail page URLs; the n-th URL is written to ``<name>楼盘<n>.csv``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    name = name.strip()
    # enumerate replaces the original hand-maintained `num` counter.
    for num, url in enumerate(urls, start=1):
        response = requests.get(url, headers=headers).text
        html = etree.HTML(response)
        # Skip the header row (position 1); each remaining row is one unit.
        trs = html.xpath('//table[@class="dataintable"]//tr[position()>1]')
        # utf-8-sig so the Chinese text opens correctly in Excel on Windows
        # (previously the platform default codec was used, which can fail).
        with open(name + '楼盘' + str(num) + '.csv', 'w', newline='', encoding='utf-8-sig') as file:
            writer = csv.writer(file)
            for tr in trs:
                tds = tr.xpath('./td[position()<7]')
                # Empty cells have no text node; fall back to '' instead of
                # raising IndexError as the original [0] index did.
                writer.writerow([(td.xpath('./text()') or [''])[0] for td in tds])
# get_building_info('tgj123','http://tp.tangshan.gov.cn:8090/wsysbudinghouse.jspx?item_code=00001492&build_code=0207')
def get_item_info(item, urls1, urls2):
    """Scrape pre-sale-permit detail pages and merge them with building summaries.

    Parameters
    ----------
    item : str
        Query label, forwarded to :func:`get_building_info` as the CSV prefix.
    urls1 : iterable of str
        Permit detail page URLs (key/value tables).
    urls2 : iterable of str
        Building list page URLs; the last row of each table is sampled.

    Returns
    -------
    list of list of str
        One row per permit: detail values followed by the building summary cells.
        Truncated to the shorter of the two URL lists (zip semantics, matching
        the original two-iterable ``map``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    # Pass 1: permit detail pages — the value cells are the even-numbered <td>s.
    # (The original predicate `position()=((position() mod 2)=0)` relied on
    # XPath 1.0 boolean coercion and selected exactly the even positions.)
    informations = []
    for url in urls1:
        response = requests.get(url, headers=headers).text
        html = etree.HTML(response)
        info = []
        for tr in html.xpath('//table[@class="dataintable"]//tr'):
            for td in tr.xpath('./td[position() mod 2 = 0]'):
                text = td.xpath('./text()')
                # Guard empty cells instead of IndexError-ing.
                info.append(text[0] if text else '')
        informations.append(info)
    # Pass 2: building list pages — take the last row's first six cells and
    # remember the link to the per-building detail page.
    ppx = []
    loupan_urls = []
    for url in urls2:
        response = requests.get(url, headers=headers).text
        html = etree.HTML(response)
        tds = html.xpath('//table[@class="dataintable"]//tr[last()]/td[position()<7]')
        href = html.xpath('//table[@class="dataintable"]//tr[last()]/td[last()]/a/@href')[0]
        loupan_urls.append('http://tp.tangshan.gov.cn:8090' + href)
        ppx.append([(td.xpath('./text()') or [''])[0] for td in tds])
    # BUGFIX: download the per-building tables once, after ALL links are known.
    # The original called this inside the loop above, re-fetching every earlier
    # URL and rewriting its CSV on each iteration.
    get_building_info(item, loupan_urls)
    # Stitch the two passes together row by row.
    return [detail + summary for detail, summary in zip(informations, ppx)]
# ---------------------------------------------------------------------------
# Script entry: drive a headless Chrome through the pre-sale query form,
# collect the detail/building link lists from the result table, then scrape
# and export everything to <query>.csv.
# ---------------------------------------------------------------------------

# Output column headers (shared by both query modes; the original duplicated
# this list in each branch).
CSV_HEADERS = ['预售证号', '售房单位', '项目名称', '预售总面积', '房屋坐落位置', ' 预售套数', '发证日期', '预售范围', '发证机关', '预售对象', '栋号', '预售许可证',
               '总套数', '总面积', '总层数', '状态', '楼盘']

BASE_URL = 'http://tp.tangshan.gov.cn:8090'


def _collect_detail_urls(page_source):
    """Extract absolute permit-detail and building-list URLs from a result page."""
    html = etree.HTML(page_source)
    urls1 = [BASE_URL + u for u in html.xpath('//table[@class="dataintable"]//tr[position()>1]/td[5]/a/@href')]
    urls2 = [BASE_URL + u for u in html.xpath('//table//tr[position()>1]/td[last()]/a/@href')]
    return urls1, urls2


def _write_csv(filename, rows):
    """Write header + data rows; utf-8-sig so Excel renders the Chinese text."""
    with open(filename, 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(CSV_HEADERS)
        writer.writerows(rows)


chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options,
                          executable_path='C:\\Program Files (x86)\\Google\\Chrome\\Application\\CHROME\\chromedriver.exe')
try:
    driver.get('http://tp.tangshan.gov.cn:8090/wsyscx.jspx')
    time.sleep(1)
    se = driver.find_element_by_id('type')
    key = input('请输入预售查询的类型,为 预售证 还是 项目名称(如果你输入的不是预售证,则默认为后者): ')
    if key == '预售证':
        Select(se).select_by_index(1)
        query = input('请输入预售证:')
    else:
        # NOTE(review): the original also used index 1 here — confirm whether
        # the "项目名称" option should be a different dropdown index.
        Select(se).select_by_index(1)
        query = input('请输入项目名称:')
    driver.find_element_by_name('typeval').send_keys(query)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="container"]/div/div/form/table/tbody/tr[2]/td[3]/input').click()
    # Give the results table a moment to render before reading page_source
    # (the original read it immediately after the click).
    time.sleep(1)
    urls1, urls2 = _collect_detail_urls(driver.page_source)
finally:
    # BUGFIX: the original never quit the driver, leaking the browser process.
    driver.quit()

# BUGFIX: the original tested `urls1 == None`, which is never true for an
# xpath() result (always a list), so the empty-result guard was dead code.
if not urls1:
    raise SystemExit('未找到任何查询结果')

# BUGFIX: in the 预售证 branch the original called get_item_info(urls1, urls2)
# with the first argument missing (TypeError) and then referenced an undefined
# `item`; both branches are now one deduplicated path keyed on `query`.
query = query.strip()
informations = get_item_info(query, urls1, urls2)
_write_csv(query + '.csv', informations)