Python 爬取安居客(BeautifulSoup:先找大盒子,再在大盒子里找各个小盒子)

# -*- coding: utf-8 -*-
"""
Created on Sat Jan 20 22:16:40 2018

@author: Administrator
"""

import requests
from bs4 import BeautifulSoup
import time
import re
from multiprocessing import Pool

# HTTP headers sent with every request; the User-Agent is a desktop Chrome string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.94 Safari/537.36'
    ),
}
# Module-level accumulator that get_loupan() appends one dict per listing to.
total = list()
def _tag_text(tag, default=''):
    """Return ``tag.text``, or ``default`` when the lookup found no tag."""
    return default if tag is None else tag.text


def get_loupan(url):
    """Scrape one listing page and append its entries to the global ``total``.

    Every ``<div rel="nofollow">`` on the page is treated as one listing.
    For each listing a dict with the keys '标题' (title), '价格' (price),
    '地址' (address), '面积' (area / layout) and '评论' (comments) is appended
    to the module-level ``total`` list.

    Args:
        url: Address of the listing page to download.

    Returns:
        The module-level ``total`` list (the same object on every call),
        including the entries added by this call.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    res = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(res.text, 'html.parser')

    # Each matching div is the container holding all tags for one listing.
    for fang_each in soup.find_all('div', rel='nofollow'):
        title_tag = fang_each.find('span', class_='items-name')
        if title_tag is None:
            # A div without a title is not a listing card; skip it instead of
            # crashing on ``None.text``.
            continue

        title = title_tag.text.strip()
        jiage = _tag_text(fang_each.find('p')).strip()
        dizhi = _tag_text(fang_each.find('span', class_='list-map')).strip()

        huxing = fang_each.find('a', class_='huxing')
        if huxing is None:
            # No layout/area link on this card: store the "not yet published"
            # placeholder, since ``.text`` cannot be read from None.
            mianji_1 = '尚未公开'
        else:
            # Drop all whitespace so the multi-line layout text becomes one token.
            mianji_1 = re.sub(r'\s+', '', huxing.text)

        pinglun_1 = _tag_text(fang_each.find('span', class_='list-dp'))

        total.append({
            '标题': title,
            '价格': jiage,
            '地址': dizhi,
            '面积': mianji_1,
            '评论': pinglun_1,
        })
    return total
if __name__ == '__main__':
    # Benchmark: crawl the same 24 listing pages serially, then with a
    # 2-process pool, then with a 4-process pool, printing the elapsed
    # wall-clock seconds for each run.
    urls = ['https://sh.fang.anjuke.com/loupan/all/p{}/'.format(i)
            for i in range(1, 25)]

    start = time.time()
    for url in urls:
        get_loupan(url)
    print('串行爬虫:', time.time() - start)

    # NOTE: the pooled runs are for timing only. pool.map's return value is
    # discarded, and appends made inside worker processes do not reach this
    # process's ``total`` list.
    for workers in (2, 4):
        start = time.time()
        # The context manager shuts the worker processes down once the run
        # is finished, so pools do not pile up between measurements.
        with Pool(processes=workers) as pool:
            pool.map(get_loupan, urls)
            # Stop the clock before teardown so only pool start-up and the
            # crawl itself are measured.
            elapsed = time.time() - start
        print('{}进程爬虫:'.format(workers), elapsed)
# Disabled snippet, kept as a bare string literal so it never executes:
# builds a pandas DataFrame from the collected `total` records and writes it
# to the Excel file 'anjuke.xls'.
'''       
import pandas as pd
df=pd.DataFrame(total)
df.to_excel('anjuke.xls')
'''
            
# Disabled snippet, kept as a bare string literal so it never executes:
# regex-based alternative that pulls the floor-area <span> text straight out
# of the raw response body and prints the matches.
# NOTE(review): `res` only exists as a local inside get_loupan, so this would
# have to be moved there before it could be re-enabled.
'''    
import re
mianji_r=re.findall('<span>建筑面积:(.*?)</span>',res.text)
print(mianji_r)
'''

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值