爬取房天下(全站)

# -*- coding: utf-8 -*-
# @Time    : 2017/9/5 10:58
# @Author  : z
# @File    : 房天下.py
# @Software: PyCharm

import logging
import re

import requests
import threadpool
from bs4 import BeautifulSoup

class House(object):
    """Crawler that prints new-home project details from fang.com.

    Flow: ``join_url`` picks the city listing URL, ``gain_num`` walks the
    result pages, ``parse_page`` extracts every listing on one page and
    ``parse_detail`` prints the project's detail page section by section.

    All scraped data is written to stdout with ``print``.  When a page lacks
    an expected element, the missing value is printed as an empty string or
    the page/listing is skipped with a warning sent through ``logging``.
    """

    # Seconds to wait on a request before giving up; without a timeout
    # requests.get() can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Pager markup of the form <span class="ff3333">CUR</span>/TOTAL</span>;
    # the captured group holds the text that contains the total page count.
    _PAGER_RE = re.compile(r'<span class="ff3333">.*?</span>/(.*?)</span>', re.S)
    _DIGITS_RE = re.compile(r'\d+')

    _log = logging.getLogger(__name__)

    def __init__(self):
        # Decoded body of the most recently fetched page (set by get_url).
        self.html = ''

    def get_url(self, url):
        """Fetch *url* and store the decoded response body in ``self.html``.

        Raises whatever ``requests.get`` raises (connection errors, timeout).
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Decode with the encoding detected from the body itself.
        response.encoding = response.apparent_encoding
        self.html = response.text

    def join_url(self):
        """Start the crawl from the hard-coded Shanghai listing URL."""
        # NOTE(review): an earlier draft fetched the city list from
        # http://js.soufunimg.com/homepage/new/family/css/citys.js and built
        # one 'http://newhouse.<city>.fang.com/house/s/' URL per entry, but
        # it eval()'d the downloaded text.  Parse that payload safely before
        # re-enabling a multi-city crawl.
        self.gain_num('上海', 'http://newhouse.sh.fang.com/house/s/')

    def gain_num(self, city, city_url):
        """Read the page count from *city_url* and crawl every result page.

        Page ``n`` is requested as ``city_url + 'b9' + str(n)`` for
        ``n = 1 .. max``.  When no pager can be found in the page, a single
        result page is assumed.
        """
        self.get_url(city_url)
        for num in range(1, self._max_page(self.html) + 1):
            self.parse_page(city, city_url + 'b9' + str(num), num)

    def parse_page(self, city, city_url, num):
        """Fetch one result page and hand each listing to ``parse_detail``.

        :param city: city name, passed through to the printed output.
        :param city_url: URL of the result page to fetch.
        :param num: page number, passed through to the printed output.
        """
        self.get_url(city_url)
        soup = BeautifulSoup(self.html, 'lxml')
        container = soup.find('div', id='newhouse_loupai_list')
        listing = container.find('ul') if container is not None else None
        if listing is None:
            self._log.warning('no listing block found on %s', city_url)
            return

        for item in listing.find_all('li'):
            details = item.find('div', 'nlc_details')
            name_box = details.find('div', 'nlcd_name') if details is not None else None
            link = name_box.a if name_box is not None else None
            if link is None or not link.get('href'):
                # Entry without a project link: nothing to follow.
                continue

            # The address text looks like "[district] street ..."; keep only
            # the part before ']' and drop the bracket and tabs.
            district_name = ''
            address = details.find('div', 'address')
            if address is not None:
                district_name = (address.get_text().strip().split(']')[0]
                                 .replace('\t', '').replace('[', ''))

            # Reset for every listing: a listing without a price block must
            # not inherit the previous listing's price (and the name must be
            # bound even when the very first listing has no price).
            project_price = ''
            price_box = details.find('div', 'nhouse_price')
            price_span = price_box.find('span') if price_box is not None else None
            if price_span is not None:
                project_price = price_span.get_text()

            self.parse_detail(link['href'], city, district_name,
                              link.get_text().strip(), project_price, num)

    def parse_detail(self, city_url, city_name, district_name, project_name, project_price, num):
        """Print every section of one project's detail page.

        *city_url* is the project's landing page; the detail page is the
        target of the second link inside its ``orginalNaviBox`` element.  The
        remaining arguments come from the result page and are printed in the
        header.  If the detail link cannot be found the project is skipped.
        """
        self.get_url(city_url)
        nav = BeautifulSoup(self.html, 'lxml').find('div', id='orginalNaviBox')
        nav_links = nav.find_all('a') if nav is not None else []
        if len(nav_links) < 2 or not nav_links[1].get('href'):
            self._log.warning('no detail link found on %s', city_url)
            return

        self.get_url(nav_links[1]['href'])
        soup = BeautifulSoup(self.html, 'lxml')
        # The six 'main-item' blocks are consumed by position; pad with None
        # so a page with fewer blocks yields empty output instead of an
        # IndexError.
        sections = list(soup.find_all('div', 'main-item'))
        sections += [None] * (6 - len(sections))

        print('------------------------------------1-----------------------------------------------')
        print('***********************')
        print('第{}页'.format(num))
        print(city_name)
        print(project_name)
        print(district_name)
        print(project_price)
        print('***********************')

        self._print_basic_info(sections[0])
        print('')
        self._print_sales_info(sections[1])
        print(' ')
        self._print_facilities(sections[2])
        print(' ')
        self._print_planning(sections[3])
        print(' ')
        self._print_price_history(sections[4])
        print(' ')
        self._print_intro(sections[5])
        print(' ')
        print('---------------------------------------2-------------------------------------------')
        print(' ')

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @classmethod
    def _max_page(cls, html):
        """Return the total page count found in *html*, or 1 if there is none."""
        pager = cls._PAGER_RE.search(html)
        digits = cls._DIGITS_RE.search(pager.group(1)) if pager else None
        return int(digits.group()) if digits else 1

    @staticmethod
    def _clean(text):
        """Remove newlines and tabs from *text* and trim surrounding whitespace."""
        return text.replace('\n', '').replace('\t', '').strip()

    @classmethod
    def _field(cls, items, index):
        """Return the cleaned text of ``items[index]``, or '' when it is absent."""
        if index < len(items):
            return cls._clean(items[index].get_text())
        return ''

    @staticmethod
    def _list_items(section, css_class):
        """Return the <li> tags of the <ul class=css_class> inside *section*."""
        box = section.find('ul', css_class) if section is not None else None
        return box.find_all('li') if box is not None else []

    def _print_basic_info(self, section):
        """Print the eight basic-information fields (item 1 is the feature tags)."""
        items = self._list_items(section, 'list clearfix')
        print(self._field(items, 0))
        tags_box = items[1].find('div', 'list-right') if len(items) > 1 else None
        tags = tags_box.find_all('span', 'tag') if tags_box is not None else []
        print('项目特色:' + ','.join(tag.get_text() for tag in tags))
        for index in range(2, 8):
            print(self._field(items, index))

    def _print_sales_info(self, section):
        """Print the sales fields, then one entry per presale-permit table row."""
        items = self._list_items(section, 'list clearfix')
        # Items 4 and 5 are deliberately not printed, matching the previous
        # output (item 4 used to be read but was never shown).
        for index in (0, 1, 2, 3, 6):
            print(self._field(items, index))

        table_box = section.find('div', 'main-table') if section is not None else None
        if table_box is None:
            return
        for table in table_box.find_all('table'):
            # The first row is skipped (presumably the header row).
            for row in table.find_all('tr')[1:]:
                cells = row.find_all('td')
                if len(cells) < 3:
                    continue
                print('预售许可证: ' + cells[0].get_text())
                print('发证时间: ' + cells[1].get_text())
                print('绑定楼栋: ' + cells[2].get_text())

    def _print_facilities(self, section):
        """Print the traffic and facilities texts; print nothing if either is missing."""
        if section is None:
            return
        try:
            # NOTE(review): '\br' is a backspace followed by 'r'; it was
            # presumably meant to drop <br> markup, which get_text() already
            # discards.  Kept unchanged to preserve the output.
            project_traffic = (section.find('div', 'set').get_text()
                               .replace('\n', '').replace('\br', '')
                               .replace(' ', '').strip())
            project_support = (section.find('div', 'set bd-1').find('p').get_text()
                               .replace('\n', '').replace('\br', '')
                               .replace('\t', '').strip())
        except AttributeError:
            # A find() returned None: this page has no such block.  The
            # section is optional, so it is skipped as a whole.
            return
        print('交通配套:' + project_traffic)
        print('项目配套:' + project_support)

    def _print_planning(self, section):
        """Print the community-planning fields (item 4 is not printed)."""
        items = self._list_items(section, 'clearfix list')
        for index in (0, 1, 2, 3, 5, 6, 7, 8, 9):
            print(self._field(items, index))

    def _print_price_history(self, section):
        """Print one four-line entry per row of the price tables."""
        tables = section.find_all('table') if section is not None else []
        for table in tables:
            # The first row is skipped (presumably the header row).
            for row in table.find_all('tr')[1:]:
                cells = row.find_all('td')
                if len(cells) < 4:
                    continue
                print('记录时间:' + cells[0].get_text())
                print('均价:' + cells[1].get_text())
                print('起价:' + cells[2].get_text())
                print('价格描述:' + cells[3].get_text())

    def _print_intro(self, section):
        """Print the project introduction paragraph ('' when it is missing)."""
        intro = section.find('p', 'intro') if section is not None else None
        project_caption = intro.get_text().strip() if intro is not None else ''
        print('项目简介:' + project_caption)



if __name__ == '__main__':
    # Run the crawl only when executed as a script, not when imported.
    House().join_url()

(代码有点瑕疵,如果有更好的解决方式,请联系我,共同进步!)

代码

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值